diff --git a/.circleci/config.yml b/.circleci/config.yml index 476e9867cf2a4a..1b328b21fe5bfa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,15 +1,19 @@ +# IMPORTANT: To update Docker image version, please search and update ":{previous_version}" +# in this file to the new version number, and **ALSO** update the version number below: +# PyTorchDockerVersion:251 +# Caffe2DockerVersion:206 + docker_config_defaults: &docker_config_defaults user: jenkins aws_auth: - # This IAM user only allows read-only access to ECR - aws_access_key_id: AKIAJ2J6FIG5OSZTQ3IA - aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} + # This IAM user only allows read-write access to ECR + aws_access_key_id: AKIAI43PKLK3PGLUWQMA + aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE} # NOTE: We only perform the merge in build step and not in test step, because # all source files will be shared from build to test merge_pull_request_onto_master: &merge_pull_request_onto_master name: Merge Onto Master - no_output_timeout: "10h" command: | if [[ "${CIRCLE_BRANCH}" != "master" ]]; then git config --global user.email "circleci.ossci@gmail.com" @@ -29,6 +33,56 @@ merge_pull_request_onto_master: &merge_pull_request_onto_master git merge --no-edit --no-ff ${GIT_MERGE_TARGET} fi +setup_ci_environment: &setup_ci_environment + name: Set Up CI Environment + no_output_timeout: "1h" + command: | + set -ex + sudo pip install awscli + + sudo apt-get update + sudo apt-get remove linux-image-generic linux-headers-generic linux-generic + sudo apt-get install linux-headers-$(uname -r) + sudo apt-get install linux-image-generic + + curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + sudo apt-get update + sudo apt-get install -y nvidia-docker2 + sudo pkill -SIGHUP dockerd + + if [[ "${JOB_BASE_NAME}" == *-test* ]]; then + if [ -n "${CUDA_VERSION}" ]; then + wget 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-396.26.run' + sudo /bin/bash ./NVIDIA-Linux-x86_64-396.26.run -s --no-drm + nvidia-smi + fi + fi + + if [[ "${JOB_BASE_NAME}" == *-build ]]; then + echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env + echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env + echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env + if [ -n "${CUDA_VERSION}" ]; then + echo "declare -x TORCH_CUDA_ARCH_LIST=5.2" >> /home/circleci/project/env + fi + export SCCACHE_MAX_JOBS=`expr $(nproc) - 1` + export MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM + export MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? 
${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} )) + echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env + + # This IAM user allows write access to S3 bucket for sccache + echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env + fi + + # This IAM user only allows read-write access to ECR + export AWS_ACCESS_KEY_ID=AKIAI43PKLK3PGLUWQMA + export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE} + eval $(aws ecr get-login --region us-east-1 --no-include-email) + pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults resource_class: large working_directory: /var/lib/jenkins/workspace @@ -37,8 +91,8 @@ pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults - run: <<: *merge_pull_request_onto_master - run: - name: Build - no_output_timeout: "10h" + name: Build And Test + no_output_timeout: "1h" command: | export IN_CIRCLECI=1 export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 @@ -54,38 +108,40 @@ pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults pytorch_linux_build_defaults: &pytorch_linux_build_defaults resource_class: large - working_directory: /var/lib/jenkins/workspace + machine: + image: default steps: - checkout - run: <<: *merge_pull_request_onto_master + - run: + <<: *setup_ci_environment - run: name: Build - no_output_timeout: "10h" + no_output_timeout: "1h" command: | - export IN_CIRCLECI=1 - export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 - if [ -n "${CUDA_VERSION}" ]; then - export TORCH_CUDA_ARCH_LIST=5.2 - fi - export SCCACHE_MAX_JOBS=`expr $(nproc) - 1` - export MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM - export MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} )) - # This IAM user allows write access to S3 bucket for sccache - export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} + # Pull Docker image and run build + docker pull ${DOCKER_IMAGE} + export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) + git submodule sync && git submodule update --init - .jenkins/pytorch/build.sh - export PYTORCH_CI_ENV_DIR=/var/lib/jenkins/pytorch-ci-env - mkdir -p ${PYTORCH_CI_ENV_DIR} - cp -r /var/lib/jenkins/workspace ${PYTORCH_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step - cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch ${PYTORCH_CI_ENV_DIR}/torch - cp -r build/bin ${PYTORCH_CI_ENV_DIR}/cpp_test_bin - if [ -d "../cpp-build" ]; then - cp -r ../cpp-build ${PYTORCH_CI_ENV_DIR}/cpp-build + + docker cp /home/circleci/project/. 
$id:/var/lib/jenkins/workspace + + (echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/build.sh') | docker exec -u jenkins -i "$id" bash + + # Push intermediate Docker image and save COMMIT_DOCKER_IMAGE for next phase to use + mkdir -p /home/circleci/project/pytorch-ci-env + touch /home/circleci/project/pytorch-ci-env/.tmp # Dummy file so that the persist_to_workspace step won't complain about not finding any file + + if [ -z "${BUILD_ONLY}" ]; then + export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} + docker commit "$id" ${COMMIT_DOCKER_IMAGE} + docker push ${COMMIT_DOCKER_IMAGE} + echo "declare -x COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}" > /home/circleci/project/pytorch-ci-env/COMMIT_DOCKER_IMAGE fi - persist_to_workspace: - root: /var/lib/jenkins/pytorch-ci-env + root: /home/circleci/project/pytorch-ci-env paths: - "*" @@ -96,90 +152,73 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults - run: name: Prepare workspace command: | - sudo mkdir -p /opt/workspace - sudo chmod -R 777 /opt/workspace + sudo mkdir -p /home/circleci/project/pytorch-ci-env + sudo chmod -R 777 /home/circleci/project/pytorch-ci-env - attach_workspace: - at: /opt/workspace + at: /home/circleci/project/pytorch-ci-env - run: - name: Build - no_output_timeout: "10h" + <<: *setup_ci_environment + - run: + name: Test + no_output_timeout: "1h" command: | - set -x - sudo pip install awscli - if [ -n "${CUDA_VERSION}" ]; then - curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - fi - sudo apt-get update - sudo apt-get remove linux-image-generic linux-headers-generic linux-generic - sudo apt-get install linux-headers-$(uname -r) - sudo apt-get install linux-image-generic - if [ -n "${CUDA_VERSION}" ]; then - wget 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-396.26.run' - sudo /bin/bash ./NVIDIA-Linux-x86_64-396.26.run -s --no-drm - sudo apt-get install -y nvidia-docker2 - fi - sudo pkill -SIGHUP dockerd - if [ -n "${CUDA_VERSION}" ]; then - nvidia-smi - fi - # This IAM user only allows read-only access to ECR - export AWS_ACCESS_KEY_ID=AKIAJ2J6FIG5OSZTQ3IA - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - docker pull ${DOCKER_IMAGE} + source /home/circleci/project/pytorch-ci-env/COMMIT_DOCKER_IMAGE + docker pull ${COMMIT_DOCKER_IMAGE} if [ -n "${CUDA_VERSION}" ]; then - id=$(docker run --runtime=nvidia -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) + id=$(docker run --runtime=nvidia -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) else - id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - fi - pwd - - cp -r /opt/workspace/build_workspace/. 
/home/circleci/project # This copies all source files from build step to the current step - - echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env - echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env - echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env - # This IAM user allows write access to S3 bucket for sccache - echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env - - mkdir -p /home/circleci/project/build - cp -r /opt/workspace/cpp_test_bin /home/circleci/project/build/bin - docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" - echo "mkdir -p /opt/conda/lib/python${PYTHON_VERSION}/site-packages" | docker exec -u jenkins -i "$id" bash - docker cp "/opt/workspace/torch" "$id:/opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch" - if [ -d "/opt/workspace/cpp-build" ]; then - docker cp "/opt/workspace/cpp-build" "$id:/var/lib/jenkins/cpp-build" + id=$(docker run -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) fi if [ -n "${MULTI_GPU}" ]; then - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash + (echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash else - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash + (echo "export JOB_BASE_NAME=${JOB_BASE_NAME}" && echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash fi caffe2_linux_build_defaults: &caffe2_linux_build_defaults resource_class: large - working_directory: /var/lib/jenkins/workspace + machine: + image: default steps: - checkout - run: <<: *merge_pull_request_onto_master - run: name: Build - no_output_timeout: "10h" + no_output_timeout: "1h" command: | - export IN_CIRCLECI=1 - export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 - # This IAM user allows write access to S3 bucket for sccache - export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} + set -ex + sudo pip install awscli + + sudo apt-get update + sudo apt-get remove linux-image-generic linux-headers-generic linux-generic + sudo apt-get install linux-headers-$(uname -r) + sudo apt-get install linux-image-generic + + curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + sudo apt-get update + sudo apt-get install -y nvidia-docker2 + + echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env + echo "declare -x 
SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env + if [ -n "${CUDA_VERSION}" ]; then + echo "declare -x TORCH_CUDA_ARCH_LIST=5.2" >> /home/circleci/project/env + fi export SCCACHE_MAX_JOBS=`expr $(nproc) - 1` export MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM export MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} )) + echo "declare -x MAX_JOBS=${MAX_JOBS}" >> /home/circleci/project/env + # This IAM user allows write access to S3 bucket for sccache + echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env + + # TODO: merge this into Caffe2 build.sh + cat >/home/circleci/project/ci_build_script.sh < /dev/null; then sccache --show-stats fi + # =================== The above code will be executed inside Docker container =================== + EOL + chmod +x /home/circleci/project/ci_build_script.sh - # Copy all necessary binaries to shared workspace - export CAFFE2_CI_ENV_DIR=/var/lib/jenkins/caffe2-ci-env - mkdir -p ${CAFFE2_CI_ENV_DIR} - cp -r /var/lib/jenkins/workspace ${CAFFE2_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step - cp -r third_party/onnx ${CAFFE2_CI_ENV_DIR}/onnx - if [ -d "/usr/local/caffe2" ]; then - cp -r /usr/local/caffe2 ${CAFFE2_CI_ENV_DIR}/caffe2 - fi - if [ -d "/opt/conda" ]; then - cp -r /opt/conda ${CAFFE2_CI_ENV_DIR}/conda_env + sudo pkill -SIGHUP dockerd + # This IAM user only allows read-write access to ECR + export AWS_ACCESS_KEY_ID=AKIAI43PKLK3PGLUWQMA + export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE} + eval $(aws ecr get-login --region us-east-1 --no-include-email) + docker pull ${DOCKER_IMAGE} + export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) + docker cp /home/circleci/project/. 
$id:/var/lib/jenkins/workspace + + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh') | docker exec -u jenkins -i "$id" bash + + mkdir -p /home/circleci/project/caffe2-ci-env + touch /home/circleci/project/caffe2-ci-env/.tmp # Dummy file so that the persist_to_workspace step won't complain about not finding any file + + if [ -z "${BUILD_ONLY}" ]; then + export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-${CIRCLE_SHA1} + docker commit "$id" ${COMMIT_DOCKER_IMAGE} + docker push ${COMMIT_DOCKER_IMAGE} + echo "declare -x COMMIT_DOCKER_IMAGE=${COMMIT_DOCKER_IMAGE}" > /home/circleci/project/caffe2-ci-env/COMMIT_DOCKER_IMAGE fi - persist_to_workspace: - root: /var/lib/jenkins/caffe2-ci-env + root: /home/circleci/project/caffe2-ci-env paths: - "*" @@ -246,56 +292,40 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults - run: name: Prepare workspace command: | - sudo mkdir -p /opt/workspace - sudo chmod -R 777 /opt/workspace + sudo mkdir -p /home/circleci/project/caffe2-ci-env + sudo chmod -R 777 /home/circleci/project/caffe2-ci-env - attach_workspace: - at: /opt/workspace + at: /home/circleci/project/caffe2-ci-env - run: - name: Build - no_output_timeout: "10h" + name: Test + no_output_timeout: "1h" command: | set -x sudo pip install awscli - if [ -n "${CUDA_VERSION}" ]; then - curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list - fi + sudo apt-get update sudo apt-get remove linux-image-generic linux-headers-generic linux-generic sudo apt-get install linux-headers-$(uname -r) sudo apt-get install linux-image-generic + + curl -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - + echo "deb https://nvidia.github.io/libnvidia-container/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-container-runtime/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + echo "deb https://nvidia.github.io/nvidia-docker/ubuntu14.04/amd64 /" | sudo tee -a /etc/apt/sources.list.d/nvidia-docker.list + sudo apt-get update + sudo apt-get install -y nvidia-docker2 + if [ -n "${CUDA_VERSION}" ]; then wget 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-396.26.run' sudo /bin/bash ./NVIDIA-Linux-x86_64-396.26.run -s --no-drm - sudo apt-get install -y nvidia-docker2 fi sudo pkill -SIGHUP dockerd if [ -n "${CUDA_VERSION}" ]; then nvidia-smi fi - # This IAM user only allows read-only access to ECR - export AWS_ACCESS_KEY_ID=AKIAJ2J6FIG5OSZTQ3IA - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - docker pull ${DOCKER_IMAGE} - if [ -n "${CUDA_VERSION}" ]; then - id=$(docker run --runtime=nvidia -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - else - id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - fi - pwd - cp -r /opt/workspace/build_workspace/. 
/home/circleci/project # This copies all source files from build step to the current step - echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env - echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env - # This IAM user allows write access to S3 bucket for sccache - echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" >> /home/circleci/project/env - # TODO: merge this into Caffe2 build.sh - cat >/home/circleci/project/ci_build_script.sh </home/circleci/project/ci_test_script.sh <options());'.format(buffer['name'])) actuals = [arg['name'] for arg in option['arguments'] if arg.get('output')] actuals += [buffer['name'] for buffer in option['buffers']] actuals += [arg['name'] for arg in option['arguments'] if not arg.get('output')] diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 082ca4999a07df..c1e837ade377f2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1901,9 +1901,6 @@ SparseCPU: new_with_size_sparse SparseCUDA: new_with_size_sparse -- func: tensor(Type dtype) -> Tensor - variants: [] - - func: tensor(Type dtype, IntList size) -> Tensor variants: [] diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py index 048e151c2c02eb..579aae3e1b924d 100644 --- a/binaries/bench_gen/bench_gen.py +++ b/binaries/bench_gen/bench_gen.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import argparse +import ast from caffe2.python.model_helper import ModelHelper from caffe2.python.predictor import mobile_exporter @@ -15,18 +16,15 @@ def parse_kwarg(kwarg_str): key, value = kwarg_str.split('=') try: - value = int(value) + value = ast.literal_eval(value) except ValueError: - try: - value = float(value) - except ValueError: - pass + pass return key, value def main(args): # User defined keyword arguments - kwargs = {"order": "NCHW"} + kwargs = {"order": "NCHW", "use_cudnn": False} kwargs.update(dict(args.kwargs)) model = ModelHelper(name=args.benchmark_name) diff --git a/c10/test/registry_test.cpp b/c10/test/registry_test.cpp index c6e7f620e602b5..5936195f2f8516 100644 --- a/c10/test/registry_test.cpp +++ b/c10/test/registry_test.cpp @@ -13,6 +13,7 @@ class Foo { explicit Foo(int x) { // LOG(INFO) << "Foo " << x; } + virtual ~Foo() {} }; C10_DECLARE_REGISTRY(FooRegistry, Foo, int); @@ -46,4 +47,45 @@ TEST(RegistryTest, ReturnNullOnNonExistingCreator) { EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr); } +// C10_REGISTER_CLASS_WITH_PRIORITY defines static variable +void RegisterFooDefault() { + C10_REGISTER_CLASS_WITH_PRIORITY( + FooRegistry, FooWithPriority, c10::REGISTRY_DEFAULT, Foo); +} + +void RegisterFooDefaultAgain() { + C10_REGISTER_CLASS_WITH_PRIORITY( + FooRegistry, FooWithPriority, c10::REGISTRY_DEFAULT, Foo); +} + +void RegisterFooBarFallback() { + C10_REGISTER_CLASS_WITH_PRIORITY( + FooRegistry, FooWithPriority, c10::REGISTRY_FALLBACK, Bar); +} + +void RegisterFooBarPreferred() { + C10_REGISTER_CLASS_WITH_PRIORITY( + FooRegistry, FooWithPriority, c10::REGISTRY_PREFERRED, Bar); +} + +TEST(RegistryTest, RegistryPriorities) { + FooRegistry()->SetTerminate(false); + RegisterFooDefault(); + + // throws because Foo is already registered with default priority + 
EXPECT_THROW(RegisterFooDefaultAgain(), std::runtime_error);
+
+#ifdef __GXX_RTTI
+  // not going to register Bar because Foo is registered with Default priority
+  RegisterFooBarFallback();
+  std::unique_ptr<Foo> bar1(FooRegistry()->Create("FooWithPriority", 1));
+  EXPECT_EQ(dynamic_cast<Bar*>(bar1.get()), nullptr);
+
+  // will register Bar because of higher priority
+  RegisterFooBarPreferred();
+  std::unique_ptr<Foo> bar2(FooRegistry()->Create("FooWithPriority", 1));
+  EXPECT_NE(dynamic_cast<Bar*>(bar2.get()), nullptr);
+#endif
+}
+
 } // namespace c10_test
diff --git a/c10/util/Registry.h b/c10/util/Registry.h
index 9f310c73483263..7a6748274be05d 100644
--- a/c10/util/Registry.h
+++ b/c10/util/Registry.h
@@ -24,15 +24,21 @@ namespace c10 {

 template <class KeyType>
-inline void PrintOffendingKey(const KeyType& /*key*/) {
-  printf("[key type printing not supported]\n");
+inline std::string KeyStrRepr(const KeyType& /*key*/) {
+  return "[key type printing not supported]";
 }

 template <>
-inline void PrintOffendingKey(const std::string& key) {
-  printf("Offending key: %s.\n", key.c_str());
+inline std::string KeyStrRepr(const std::string& key) {
+  return key;
 }

+enum RegistryPriority {
+  REGISTRY_FALLBACK = 1,
+  REGISTRY_DEFAULT = 2,
+  REGISTRY_PREFERRED = 3,
+};
+
 /**
  * @brief A template class that allows one to register classes by keys.
  *
@@ -48,9 +54,12 @@ class Registry {
 public:
  typedef std::function<ObjectPtrType(Args...)> Creator;

-  Registry() : registry_() {}
+  Registry() : registry_(), priority_(), terminate_(true) {}

-  void Register(const SrcType& key, Creator creator) {
+  void Register(
+      const SrcType& key,
+      Creator creator,
+      const RegistryPriority priority = REGISTRY_DEFAULT) {
    std::lock_guard<std::mutex> lock(register_mutex_);
    // The if statement below is essentially the same as the following line:
    // CHECK_EQ(registry_.count(key), 0) << "Key " << key
    //                                   << " registered twice.";
    // However, CHECK_EQ depends on google logging, and since registration is
    // carried out at static initialization time, we do not want to have an
    // explicit dependency on glog's initialization function.
     if (registry_.count(key) != 0) {
-      printf("Key already registered.\n");
-      PrintOffendingKey(key);
-      std::exit(1);
+      auto cur_priority = priority_[key];
+      if (priority > cur_priority) {
+        std::string warn_msg =
+            "Overwriting already registered item for key " + KeyStrRepr(key);
+        fprintf(stderr, "%s\n", warn_msg.c_str());
+        registry_[key] = creator;
+        priority_[key] = priority;
+      } else if (priority == cur_priority) {
+        std::string err_msg =
+            "Key already registered with the same priority: " + KeyStrRepr(key);
+        fprintf(stderr, "%s\n", err_msg.c_str());
+        if (terminate_) {
+          std::exit(1);
+        } else {
+          throw std::runtime_error(err_msg);
+        }
+      } else {
+        std::string warn_msg =
+            "Higher priority item already registered, skipping registration of " +
+            KeyStrRepr(key);
+        fprintf(stderr, "%s\n", warn_msg.c_str());
+      }
+    } else {
+      registry_[key] = creator;
+      priority_[key] = priority;
     }
-    registry_[key] = creator;
   }

   void Register(
       const SrcType& key,
       Creator creator,
-      const std::string& help_msg) {
-    Register(key, creator);
+      const std::string& help_msg,
+      const RegistryPriority priority = REGISTRY_DEFAULT) {
+    Register(key, creator, priority);
     help_message_[key] = help_msg;
   }

@@ -109,8 +140,16 @@
     return it->second.c_str();
   }

+  // Used for testing; if terminate is unset, Registry throws instead of
+  // calling std::exit
+  void SetTerminate(bool terminate) {
+    terminate_ = terminate;
+  }
+
 private:
  std::unordered_map<SrcType, Creator> registry_;
+  std::unordered_map<SrcType, RegistryPriority> priority_;
+  bool terminate_;
  std::unordered_map<SrcType, std::string> help_message_;
  std::mutex register_mutex_;

@@ -120,7 +159,7 @@
 template <class SrcType, class ObjectPtrType, class... Args>
 class Registerer {
  public:
-  Registerer(
+  explicit Registerer(
       const SrcType& key,
       Registry<SrcType, ObjectPtrType, Args...>* registry,
       typename Registry<SrcType, ObjectPtrType, Args...>::Creator creator,
@@ -128,6 +167,15 @@
     registry->Register(key, creator, help_msg);
   }

+  explicit Registerer(
+      const SrcType& key,
+      const RegistryPriority priority,
+      Registry<SrcType, ObjectPtrType, Args...>* registry,
+      typename Registry<SrcType, ObjectPtrType, Args...>::Creator creator,
+      const std::string& help_msg = "") {
+    registry->Register(key, creator, help_msg, priority);
+  }
+
   template <class DerivedType>
   static ObjectPtrType DefaultCreator(Args... args) {
     return ObjectPtrType(new DerivedType(args...));
@@ -187,6 +235,11 @@
   static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
       key, RegistryName(), ##__VA_ARGS__);

+#define C10_REGISTER_TYPED_CREATOR_WITH_PRIORITY(                           \
+    RegistryName, key, priority, ...)                                       \
+  static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
+      key, priority, RegistryName(), ##__VA_ARGS__);
+
 #define C10_REGISTER_TYPED_CLASS(RegistryName, key, ...)                    \
   static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
       key,                                                                  \
@@ -194,6 +247,15 @@
       Registerer##RegistryName::DefaultCreator<__VA_ARGS__>,                \
       ::c10::demangle_type<__VA_ARGS__>());

+#define C10_REGISTER_TYPED_CLASS_WITH_PRIORITY(                             \
+    RegistryName, key, priority, ...)                                       \
+  static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
+      key,                                                                  \
+      priority,                                                             \
+      RegistryName(),                                                       \
+      Registerer##RegistryName::DefaultCreator<__VA_ARGS__>,                \
+      ::c10::demangle_type<__VA_ARGS__>());
+
 // C10_DECLARE_REGISTRY and C10_DEFINE_REGISTRY are hard-wired to use
 // std::string as the key type, because that is the most commonly used cases.
 #define C10_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
@@ -218,9 +280,17 @@
 #define C10_REGISTER_CREATOR(RegistryName, key, ...)
\ C10_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) +#define C10_REGISTER_CREATOR_WITH_PRIORITY(RegistryName, key, priority, ...) \ + C10_REGISTER_TYPED_CREATOR_WITH_PRIORITY( \ + RegistryName, #key, priority, __VA_ARGS__) + #define C10_REGISTER_CLASS(RegistryName, key, ...) \ C10_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) +#define C10_REGISTER_CLASS_WITH_PRIORITY(RegistryName, key, priority, ...) \ + C10_REGISTER_TYPED_CLASS_WITH_PRIORITY( \ + RegistryName, #key, priority, __VA_ARGS__) + } // namespace c10 #endif // C10_UTIL_REGISTRY_H_ diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 70843bb0d91108..86bab6f708d723 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -237,12 +237,7 @@ def find_factory_methods(decls): } defined_inferred_type = False - if 'Tensor' in o['method_of']: - # make sure 'self' is the first argument. currently Declarations.yaml - # does not always do this. Instead it keeps the argument list the same order - # as the Type method. - o['arguments'] = self_as_first_argument(o['arguments']) - elif 'namespace' not in o['method_of']: + if 'namespace' not in o['method_of'] and 'Tensor' not in o['method_of']: # methods on type like 'ones' or 'zeros' always take a # string attribute that is translated into the at::Type object # e.g. "Float" is at::kFloat @@ -289,11 +284,11 @@ def find_factory_methods(decls): assignment = CT(t).substitute(env, offset=i, output=get_output(o, i)) env['assignments'].append(assignment) - if 'Tensor' in o['method_of']: + if 'namespace' in o['method_of']: + env['invocation'] = CT("at::${name}(${arguments})").substitute(env) + elif 'Tensor' in o['method_of']: env['invocation'] = "self.{}({})".format( o['name'], ', '.join(env['arguments'][1:])) - elif 'namespace' in o['method_of']: - env['invocation'] = CT("at::${name}(${arguments})").substitute(env) else: assert('Type' in o['method_of']) env['invocation'] = CT( diff --git a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc index a5e638fbca02b7..9d3d5229465c39 100644 --- a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc +++ b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc @@ -23,7 +23,7 @@ C10_DEFINE_int( namespace caffe2 { -std::shared_ptr +std::shared_ptr GetAsyncNetHIPThreadPool(int hip_gpu_id, int pool_size, bool create_new) { // For GPU, use per device thread pools of predefined constant size if (pool_size != c10::FLAGS_caffe2_threads_per_hip_gpu) { diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index cff40845e439a1..37f38ceaf0bcee 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -173,7 +173,7 @@ unique_ptr CreateNet( return net; } -TaskThreadPool* ExecutorHelper::GetPool( +TaskThreadPoolBase* ExecutorHelper::GetPool( const DeviceOption& /* unused */) const { CAFFE_THROW("Not implemented"); } diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 5722f2b2cc4f0c..7b89f8bff04f74 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -130,7 +130,7 @@ class CAFFE2_API NetBase : public Observable { class CAFFE2_API ExecutorHelper { public: ExecutorHelper() {} - virtual TaskThreadPool* GetPool(const DeviceOption& option) const; + virtual TaskThreadPoolBase* GetPool(const DeviceOption& option) const; virtual ~ExecutorHelper() {} }; diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index 13b9c2ac659aaf..232b89dac1d048 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -13,6 +13,11 @@ 
C10_DEFINE_int( C10_DECLARE_bool(caffe2_dag_net_collect_stats); +C10_DEFINE_bool( + caffe2_net_async_inference_mode, + false, + "If set, use one single chain containing all ops"); + C10_DEFINE_bool( caffe2_net_async_finish_chain, false, @@ -73,7 +78,11 @@ AsyncNetBase::AsyncNetBase( operators_.push_back(op_ptr); } - execution_chains_ = dag_utils::computeChains(operator_nodes_); + if (c10::FLAGS_caffe2_net_async_inference_mode) { + execution_chains_ = dag_utils::computeGroups(operator_nodes_); + } else { + execution_chains_ = dag_utils::computeChains(operator_nodes_); + } chains_.reserve(execution_chains_.size()); for (const auto& kv : execution_chains_) { chains_.push_back(kv.second); @@ -119,7 +128,7 @@ bool AsyncNetBase::RunAsync() { return DoRunAsync(); } -TaskThreadPool* AsyncNetBase::poolGetter( +TaskThreadPoolBase* AsyncNetBase::poolGetter( PoolsMap& pools, int device_type, int device_id, @@ -134,7 +143,7 @@ TaskThreadPool* AsyncNetBase::poolGetter( return pool.get(); } -TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { +TaskThreadPoolBase* AsyncNetBase::pool(const DeviceOption& device_option) { if (use_single_pool_) { return poolGetter(cpu_pools_, PROTO_CPU, -1, num_workers_); } @@ -418,53 +427,17 @@ void AsyncNetBase::finalizeEvents() { AsyncNetBase::~AsyncNetBase() {} -C10_DEFINE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); - -C10_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); - -/* static */ -std::shared_ptr -GetAsyncNetCPUThreadPool(int numa_node_id, int pool_size, bool create_new) { - // Note: numa_node_id = -1 corresponds to no NUMA used - static std:: - unordered_map>> - pools; - static std::mutex pool_mutex; - - if (pool_size <= 0) { - if (c10::FLAGS_caffe2_net_async_cpu_pool_size > 0) { - pool_size = c10::FLAGS_caffe2_net_async_cpu_pool_size; - LOG(INFO) << "Using default CPU pool size: " << pool_size - << "; NUMA node id: " << numa_node_id; - } else { - auto num_cores = std::thread::hardware_concurrency(); - CAFFE_ENFORCE(num_cores > 0, "Failed to get number of CPU cores"); - LOG(INFO) << "Using estimated CPU pool size: " << num_cores - << "; NUMA node id: " << numa_node_id; - pool_size = num_cores; - } - } else { - LOG(INFO) << "Using specified CPU pool size: " << pool_size - << "; NUMA node id: " << numa_node_id; - } - - if (create_new) { - LOG(INFO) << "Created new CPU pool, size: " << pool_size - << "; NUMA node id: " << numa_node_id; - return std::make_shared(pool_size, numa_node_id); - } else { - std::lock_guard lock(pool_mutex); - - auto shared_pool = pools[numa_node_id][pool_size].lock(); - if (!shared_pool) { - LOG(INFO) << "Created shared CPU pool, size: " << pool_size - << "; NUMA node id: " << numa_node_id; - shared_pool = std::make_shared(pool_size, numa_node_id); - pools[numa_node_id][pool_size] = shared_pool; - } - return shared_pool; - } -} +C10_DEFINE_SHARED_REGISTRY( + ThreadPoolRegistry, + TaskThreadPoolBase, + int, + int, + bool); + +C10_REGISTER_CREATOR( + ThreadPoolRegistry, + CPU, + GetAsyncNetCPUThreadPool); void AsyncNetBase::computeExecutionModeFlags() { static const std::string kDag = "dag"; diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 91068dbff74d10..f8d61f7c9ad361 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -75,7 +75,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { const std::vector& wait_task_ids) const; bool run(int task_id, int stream_id); int stream(int task_id); - TaskThreadPool* pool(const 
DeviceOption& device_option); + TaskThreadPoolBase* pool(const DeviceOption& device_option); void finishTasks(const std::unordered_set& task_ids); void finalizeEvents(); @@ -98,7 +98,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { // first int key - device id, second - pool size, one pool per (device, size) typedef std::unordered_map< int, - std::unordered_map>> + std::unordered_map>> PoolsMap; PoolsMap cpu_pools_; PoolsMap gpu_pools_; @@ -132,7 +132,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { private: void storeExceptionPtr(); - TaskThreadPool* + TaskThreadPoolBase* poolGetter(PoolsMap& pools, int device_type, int device_id, int pool_size); std::unique_ptr helper_; @@ -141,12 +141,17 @@ class CAFFE2_API AsyncNetBase : public NetBase { friend class tracing::Tracer; }; -C10_DECLARE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); +C10_DECLARE_SHARED_REGISTRY( + ThreadPoolRegistry, + TaskThreadPoolBase, + int, + int, + bool); class AsyncNetExecutorHelper : public ExecutorHelper { public: explicit AsyncNetExecutorHelper(AsyncNetBase* net) : net_(net) {} - TaskThreadPool* GetPool(const DeviceOption& option) const override { + TaskThreadPoolBase* GetPool(const DeviceOption& option) const override { return net_->pool(option); } @@ -154,8 +159,51 @@ class AsyncNetExecutorHelper : public ExecutorHelper { AsyncNetBase* net_; }; -std::shared_ptr -GetAsyncNetCPUThreadPool(int numa_node_id, int pool_size, bool create_new); +template +std::shared_ptr +GetAsyncNetCPUThreadPool(int numa_node_id, int pool_size, bool create_new) { + // Note: numa_node_id = -1 corresponds to no NUMA used + static std::unordered_map< + int, + std::unordered_map>> + pools; + static std::mutex pool_mutex; + + if (pool_size <= 0) { + if (c10::FLAGS_caffe2_net_async_cpu_pool_size > 0) { + pool_size = c10::FLAGS_caffe2_net_async_cpu_pool_size; + LOG(INFO) << "Using default CPU pool size: " << pool_size + << "; NUMA node id: " << numa_node_id; + } else { + auto num_cores = std::thread::hardware_concurrency(); + CAFFE_ENFORCE(num_cores > 0, "Failed to get number of CPU cores"); + LOG(INFO) << "Using estimated CPU pool size: " << num_cores + << "; NUMA node id: " << numa_node_id; + pool_size = num_cores; + } + } else { + LOG(INFO) << "Using specified CPU pool size: " << pool_size + << "; NUMA node id: " << numa_node_id; + } + + if (create_new) { + LOG(INFO) << "Created new CPU pool, size: " << pool_size + << "; NUMA node id: " << numa_node_id; + return std::make_shared(pool_size, numa_node_id); + } else { + std::lock_guard lock(pool_mutex); + + auto shared_pool = pools[numa_node_id][pool_size].lock(); + if (!shared_pool) { + LOG(INFO) << "Created shared CPU pool, size: " << pool_size + << "; NUMA node id: " << numa_node_id; + shared_pool = + std::make_shared(pool_size, numa_node_id); + pools[numa_node_id][pool_size] = shared_pool; + } + return shared_pool; + } +} } // namespace caffe2 diff --git a/caffe2/core/net_async_gpu_thread_pool.h b/caffe2/core/net_async_gpu_thread_pool.h index 29ed24cbb6ee01..eba8cd919ef282 100644 --- a/caffe2/core/net_async_gpu_thread_pool.h +++ b/caffe2/core/net_async_gpu_thread_pool.h @@ -5,7 +5,7 @@ namespace caffe2 { -std::shared_ptr +std::shared_ptr GetAsyncNetGPUThreadPool(int gpu_id, int pool_size, bool create_new); } // namespace caffe2 diff --git a/caffe2/core/net_async_gpu_thread_pool_gpu.cc b/caffe2/core/net_async_gpu_thread_pool_gpu.cc index 80d1ff47cb7ba5..d474c41d051d77 100644 --- a/caffe2/core/net_async_gpu_thread_pool_gpu.cc +++ 
b/caffe2/core/net_async_gpu_thread_pool_gpu.cc
@@ -8,7 +8,7 @@ namespace caffe2 {

 C10_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool);

-std::shared_ptr<TaskThreadPool>
+std::shared_ptr<TaskThreadPoolBase>
 GetAsyncNetGPUThreadPool(int gpu_id, int pool_size, bool create_new) {
   // For GPU, use per device thread pools of predefined constant size
   if (pool_size != c10::FLAGS_caffe2_threads_per_gpu) {
diff --git a/caffe2/core/net_dag_utils.cc b/caffe2/core/net_dag_utils.cc
index 94a70478c4aff3..8786ac6b7b10db 100644
--- a/caffe2/core/net_dag_utils.cc
+++ b/caffe2/core/net_dag_utils.cc
@@ -278,6 +278,88 @@ ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes) {
   return chains;
 }

+// Here chains are essentially groups; we use chain/group interchangeably
+ExecutionChains computeGroups(std::vector<OperatorNode>& orig_nodes) {
+  const std::vector<OpGraphNode> nodes = pruneOpNodeGraph(orig_nodes);
+  ExecutionChains chains;
+  std::vector<int> sync_frontier;
+  std::vector<int> async_frontier;
+
+  std::vector<int> in_degrees;
+  in_degrees.reserve(nodes.size());
+  std::transform(
+      nodes.begin(),
+      nodes.end(),
+      std::back_inserter(in_degrees),
+      [](const OpGraphNode& n) { return n.parents_.size(); });
+
+  // Screen out the primary root nodes
+  for (int idx = 0; idx < (int)nodes.size(); ++idx) {
+    if (in_degrees[idx] == 0) {
+      if (orig_nodes[idx].operator_->HasAsyncPart()) {
+        async_frontier.push_back(idx);
+      } else {
+        sync_frontier.push_back(idx);
+      }
+    }
+  }
+
+  // We check sync ops on the frontier first and then async ops. This gives us
+  // a head start to execute sync ops locally while waiting for async ops to
+  // finish.
+  std::queue<int> q;
+  while (!(async_frontier.empty() && sync_frontier.empty())) {
+    // Sync ops
+    for (const auto i : sync_frontier) {
+      q.push(i);
+    }
+    sync_frontier.clear();
+    std::vector<int> chain;
+    while (!q.empty()) {
+      int idx = q.front();
+      q.pop();
+      chain.push_back(idx);
+      for (int child : nodes[idx].children_) {
+        if (--in_degrees[child] == 0) {
+          if (orig_nodes[child].operator_->HasAsyncPart()) {
+            async_frontier.push_back(child);
+          } else {
+            q.push(child);
+          }
+        }
+      }
+    }
+    // add the whole group of continuous sync ops into one chain
+    if (!chain.empty()) {
+      chains.emplace(chain.front(), chain);
+    }
+
+    // Async ops
+    for (const auto i : async_frontier) {
+      q.push(i);
+    }
+    async_frontier.clear();
+    while (!q.empty()) {
+      int idx = q.front();
+      q.pop();
+      // Put each individual node as a new chain
+      chains[idx] = {idx};
+      for (int child : nodes[idx].children_) {
+        if (--in_degrees[child] == 0) {
+          if (orig_nodes[child].operator_->HasAsyncPart()) {
+            q.push(child);
+          } else {
+            sync_frontier.push_back(child);
+          }
+        }
+      }
+    }
+  }
+
+  updateOperatorNodes(orig_nodes, chains);
+  return chains;
+}
+
 ExecutionChains singleChains(std::vector<OperatorNode>& nodes) {
   ExecutionChains chains;
   for (int i = 0; i < (int)nodes.size(); ++i) {
diff --git a/caffe2/core/net_dag_utils.h b/caffe2/core/net_dag_utils.h
index 0259f10f954652..9b605a9cfd67c1 100644
--- a/caffe2/core/net_dag_utils.h
+++ b/caffe2/core/net_dag_utils.h
@@ -43,11 +43,19 @@ struct OpGraphNode {

 using ExecutionChains = std::unordered_map<int, std::vector<int>>;

-ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes);
+C10_EXPORT ExecutionChains computeChains(std::vector<OperatorNode>& orig_nodes);

-ExecutionChains singleChains(std::vector<OperatorNode>& nodes);
+// Instead of breaking down the DAG into chains, we partition it into clusters
+// of sync ops and individual async ops. This is useful for the distributed
+// inference case where we have sync and async cpu ops. Note that we have to
+// sync each async op instead of putting it into a chain and syncing its tail
+// like a GPU op, because CPU async ops are typically rpc calls and are not
+// guaranteed to be linearized at the remote site.
+C10_EXPORT ExecutionChains computeGroups(std::vector<OperatorNode>& orig_nodes);

-std::vector<OperatorNode> prepareOperatorNodes(
+C10_EXPORT ExecutionChains singleChains(std::vector<OperatorNode>& nodes);
+
+C10_EXPORT std::vector<OperatorNode> prepareOperatorNodes(
     const std::shared_ptr<const NetDef>& net_def,
     Workspace* ws);
diff --git a/caffe2/core/net_dag_utils_test.cc b/caffe2/core/net_dag_utils_test.cc
new file mode 100644
index 00000000000000..10d76fecf2fa8f
--- /dev/null
+++ b/caffe2/core/net_dag_utils_test.cc
@@ -0,0 +1,296 @@
+#include <gtest/gtest.h>
+#include "caffe2/core/net_dag_utils.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+
+namespace {
+class DummySyncOp final : public Operator<CPUContext> {
+ public:
+  DummySyncOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    return true;
+  }
+};
+
+class DummyAsyncOp final : public Operator<CPUContext> {
+ public:
+  DummyAsyncOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {}
+
+  bool RunOnDevice() override {
+    return true;
+  }
+
+  bool HasAsyncPart() const override {
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(DagUtilTestDummySync, DummySyncOp);
+REGISTER_CPU_OPERATOR(DagUtilTestDummyAsync, DummyAsyncOp);
+
+OPERATOR_SCHEMA(DagUtilTestDummySync)
+    .NumInputs(0, INT_MAX)
+    .NumOutputs(0, INT_MAX);
+OPERATOR_SCHEMA(DagUtilTestDummyAsync)
+    .NumInputs(0, INT_MAX)
+    .NumOutputs(0, INT_MAX);
+
+class DagUtilTestContext {
+ public:
+  DagUtilTestContext(const std::string& spec, Workspace* ws) {
+    net_def_ = std::make_shared<NetDef>();
+    CAFFE_ENFORCE(TextFormat::ParseFromString(spec, net_def_.get()));
+    operator_nodes_ = dag_utils::prepareOperatorNodes(net_def_, ws);
+  }
+
+  dag_utils::ExecutionChains computeChains() {
+    return dag_utils::computeGroups(operator_nodes_);
+  }
+
+ private:
+  std::shared_ptr<NetDef> net_def_{nullptr};
+  std::vector<dag_utils::OperatorNode> operator_nodes_;
+};
+
+void PrintChains(const dag_utils::ExecutionChains& chains) {
+  for (const auto kv : chains) {
+    std::stringstream ss;
+    ss << kv.first << ": ";
+    for (const auto& v : kv.second) {
+      ss << v << ", ";
+    }
+    LOG(INFO) << ss.str();
+  }
+}
+} // namespace
+
+TEST(DagUtilTest, Empty) {
+  const auto spec = R"DOC(
+        name: "test0"
+        type: "async_scheduling"
+      )DOC";
+  Workspace ws;
+  DagUtilTestContext t(spec, &ws);
+  auto chains = t.computeChains();
+  EXPECT_TRUE(chains.empty());
+}
+
+// 4 sync ops forming a diamond
+TEST(DagUtilTest, AllSync) {
+  const auto spec = R"DOC(
+        name: "test1"
+        type: "async_scheduling"
+        external_input: "in"
+        op {
+          input: "in"
+          output: "n1"
+          type: "DagUtilTestDummySync"
+        }
+        op {
+          input: "n1"
+          output: "n2"
+          type: "DagUtilTestDummySync"
+        }
+        op {
+          input: "n1"
+          output: "n3"
+          type: "DagUtilTestDummySync"
+        }
+        op {
+          input: "n2"
+          input: "n3"
+          output: "out"
+          type: "DagUtilTestDummySync"
+        }
+      )DOC";
+  Workspace ws;
+  ws.CreateBlob("in");
+  DagUtilTestContext t(spec, &ws);
+  auto chains = t.computeChains();
+  dag_utils::ExecutionChains expected{{0, {0, 1, 2, 3}}};
+  EXPECT_EQ(chains, expected);
+}
+
+// 3 async ops forming an L shape
+TEST(DagUtilTest, AllAsync) {
+  const auto spec = R"DOC(
+        name: "test2"
+        type: "async_scheduling"
+        external_input: "in0"
+        external_input: "in1"
+        op {
+          input: "in0"
+          output: "n1"
+          type: "DagUtilTestDummyAsync"
+        }
+        op {
+          input: "in1"
+          output: "n2"
+          type: "DagUtilTestDummyAsync"
+        }
+        op
{ + input: "n1" + output: "n3" + type: "DagUtilTestDummyAsync" + } + )DOC"; + Workspace ws; + ws.CreateBlob("in0"); + ws.CreateBlob("in1"); + DagUtilTestContext t(spec, &ws); + auto chains = t.computeChains(); + dag_utils::ExecutionChains expected{{0, {0}}, {1, {1}}, {2, {2}}}; + EXPECT_EQ(chains, expected); +} + +// 3 sync ops and 1 async op (#2) forming a diamond +TEST(DagUtilTest, Mixed0) { + const auto spec = R"DOC( + name: "test3" + type: "async_scheduling" + external_input: "in" + op { + input: "in" + output: "n1" + type: "DagUtilTestDummySync" + } + op { + input: "n1" + output: "n2" + type: "DagUtilTestDummySync" + } + op { + input: "n1" + output: "n3" + type: "DagUtilTestDummyAsync" + } + op { + input: "n2" + input: "n3" + output: "out" + type: "DagUtilTestDummySync" + } + )DOC"; + Workspace ws; + ws.CreateBlob("in"); + DagUtilTestContext t(spec, &ws); + auto chains = t.computeChains(); + dag_utils::ExecutionChains expected{{0, {0, 1}}, {2, {2}}, {3, {3}}}; + EXPECT_EQ(chains, expected); +} + +// 3 sync ops and 1 async op (#2) forming a Y shape +TEST(DagUtilTest, Mixed1) { + const auto spec = R"DOC( + name: "test3" + type: "async_scheduling" + external_input: "in0" + external_input: "in1" + op { + input: "in0" + output: "n1" + type: "DagUtilTestDummySync" + } + op { + input: "in1" + output: "n2" + type: "DagUtilTestDummySync" + } + op { + input: "n1" + input: "n2" + output: "n3" + type: "DagUtilTestDummyAsync" + } + op { + input: "n3" + output: "out" + type: "DagUtilTestDummySync" + } + )DOC"; + Workspace ws; + ws.CreateBlob("in0"); + ws.CreateBlob("in1"); + DagUtilTestContext t(spec, &ws); + auto chains = t.computeChains(); + dag_utils::ExecutionChains expected{{0, {0, 1}}, {2, {2}}, {3, {3}}}; + EXPECT_EQ(chains, expected); +} +// More complicated mixed case. 
* means async +// 0* -> 1* -> 2 +// | +// 3 -> 4 -> 5 +// | | +// | 6 +// - -> 8* +// 7* -/ +TEST(DagUtilTest, Mixed2) { + const auto spec = R"DOC( + name: "test4" + type: "async_scheduling" + external_input: "in0" + external_input: "in1" + external_input: "in2" + op { + input: "in0" + output: "n1" + type: "DagUtilTestDummyAsync" + } + op { + input: "n1" + output: "n2" + type: "DagUtilTestDummyAsync" + } + op { + input: "n2" + output: "out0" + type: "DagUtilTestDummySync" + } + op { + input: "in1" + output: "n3" + type: "DagUtilTestDummySync" + } + op { + input: "n1" + input: "n3" + output: "n4" + type: "DagUtilTestDummySync" + } + op { + input: "n4" + output: "out1" + type: "DagUtilTestDummySync" + } + op { + input: "n3" + output: "out2" + type: "DagUtilTestDummySync" + } + op { + input: "in2" + output: "n7" + type: "DagUtilTestDummyAsync" + } + op { + input: "n3" + input: "n7" + output: "out3" + type: "DagUtilTestDummyAsync" + } + )DOC"; + Workspace ws; + ws.CreateBlob("in0"); + ws.CreateBlob("in1"); + ws.CreateBlob("in2"); + DagUtilTestContext t(spec, &ws); + auto chains = t.computeChains(); + dag_utils::ExecutionChains expected{ + {0, {0}}, {1, {1}}, {3, {3, 6}}, {4, {4, 2, 5}}, {7, {7}}, {8, {8}}}; + EXPECT_EQ(chains, expected); +} +} // namespace caffe2 diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index 28f33a43cbff2f..d3ba0b2b38e14a 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -96,8 +96,8 @@ static std::unordered_set getTrackedNodes( repr::NNCFGraph& cf) { std::unordered_set cfTrackedNodes; for (const auto& bbNode : cf.getMutableNodes()) { - auto bb = repr::nn::get>(bbNode); - for (const auto node : bb->getInstructions()) { + auto& bb = bbNode->data(); + for (const auto node : bb.getInstructions()) { cfTrackedNodes.insert(node); } } @@ -108,7 +108,7 @@ static size_t coalesceInsertedDataDependenciesHelper(repr::NNModule* m) { auto cfTrackedNodes = getTrackedNodes(m->controlFlow); for (auto& bbNode : m->controlFlow.getMutableNodes()) { - auto bb = repr::nn::get>(bbNode); + auto bb = bbNode->mutableData(); // We mutate the instructions of the bb, so we copy here. // TODO make this an iterator and simply promote it on insertion. auto instrsCopy = bb->getInstructions(); @@ -148,13 +148,12 @@ void coalesceInsertedDataDependencies(repr::NNModule* m) { } } - auto newBbNode = m->controlFlow.createNode( - util::make_unique>()); + auto newBbNode = m->controlFlow.createAnonymousFunction(); auto sccs = algorithm::tarjans(&m->dataFlow); for (auto iter = sccs.rbegin(); iter != sccs.rend(); ++iter) { for (auto node : iter->getNodes()) { if (dfNodes.count(node)) { - auto currentBasicBlock = newBbNode->mutableData()->get(); + auto currentBasicBlock = newBbNode->mutableData(); currentBasicBlock->pushInstructionNode(node); } } @@ -162,12 +161,12 @@ void coalesceInsertedDataDependencies(repr::NNModule* m) { // Finally we reconcile any data dependency issues (if we can). 
for (auto& bbNode : m->controlFlow.getMutableNodes()) { - auto bb = bbNode->mutableData()->get(); + auto bb = bbNode->mutableData(); std::unordered_set seen; - for (auto instr_iter = bb->getInstructions().begin(); - instr_iter != bb->getInstructions().end(); + for (auto instr_iter = bb->getMutableInstructions()->begin(); + instr_iter != bb->getMutableInstructions()->end(); ++instr_iter) { - // This cannot be auto&, TODO figure out why + // This cannot be auto& because *iter is pure R-ref auto instr = *instr_iter; for (auto& output : getOutputs(instr)) { for (auto& consumer : getConsumers(output)) { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Algorithms.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Algorithms.h index 13af7c85966a48..e45d97fee769c6 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Algorithms.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Algorithms.h @@ -181,6 +181,20 @@ dominanceFrontierMap(G* g, typename G::NodeRef source = nullptr) { return domFrontierMap; } +/// \brief Induces edges on a subgraph by connecting all nodes +/// that are connected in the original graph. +template +void induceEdges(SubgraphType* sg) { + for (auto& node : sg->getNodes()) { + // We can scan only the inEdges + for (auto& inEdge : node->getInEdges()) { + if (sg->hasNode(inEdge->tail())) { + sg->addEdge(inEdge); + } + } + } +} + } // namespace algorithm } // namespace nom diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 568d46a61ff561..b12e57d7516987 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -244,6 +244,104 @@ class Graph { return createNodeInternal(Node()); } + // Note: + // The move functions below are unsafe. Use them with caution + // and be sure to call isValid() after each use. + + // Move a node from this graph to the destGraph + void moveNode(NodeRef node, Graph* destGraph) { + assert(hasNode(node)); + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + if (&(*it) == node) { + std::list>& destNodes = destGraph->nodes_; + destNodes.splice(destNodes.end(), nodes_, it); + nodeRefs_.erase(node); + destGraph->nodeRefs_.insert(node); + break; + } + } + } + + // Move an edge from this graph to the destGraph + void moveEdge(EdgeRef edge, Graph* destGraph) { + assert(hasEdge(edge)); + assert(destGraph->hasNode(edge->tail())); + assert(destGraph->hasNode(edge->head())); + std::list>& destEdges = destGraph->edges_; + for (auto it = edges_.begin(); it != edges_.end(); ++it) { + if (&(*it) == edge) { + destEdges.splice(destEdges.end(), edges_, it); + break; + } + } + } + + // Move entire subgraph to destGraph. + // Be sure to delete in/out edges from this graph first. 
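+  // (Bookkeeping note: the local copy of the subgraph made below is drained
+  // as nodes and edges are spliced into destGraph, and the asserts at the
+  // end verify that every node and edge in the subgraph was actually moved.)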
+ void moveSubgraph( + const Subgraph& subgraph, + Graph* destGraph) { + auto sg = subgraph; // Copy to check that all nodes and edges are matched + std::list>& destEdges = destGraph->edges_; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + auto node = &(*it); + if (sg.hasNode(node)) { + std::list>& destNodes = destGraph->nodes_; + destNodes.splice(destNodes.end(), nodes_, it--); + nodeRefs_.erase(node); + destGraph->nodeRefs_.insert(node); + sg.removeNode(node); + } + } + for (auto it = edges_.begin(); it != edges_.end(); ++it) { + auto edge = &(*it); + if (sg.hasEdge(edge)) { + assert(destGraph->hasNode(edge->tail())); + assert(destGraph->hasNode(edge->head())); + destEdges.splice(destEdges.end(), edges_, it--); + sg.removeEdge(edge); + } + } + assert(sg.getNodes().size() == 0); + assert(sg.getEdges().size() == 0); + } + + // Validates the graph. Returns true if the graph is valid + // and false if any node or edge referenced in the graph + // is not actually present in the graph. + bool isValid() { + for (auto& node : getMutableNodes()) { + for (auto& inEdge : node->getInEdges()) { + if (!hasEdge(inEdge)) { + DEBUG_PRINT("Invalid inEdge %p on node %p\n", inEdge, node); + return false; + } + } + for (auto& outEdge : node->getOutEdges()) { + if (!hasEdge(outEdge)) { + DEBUG_PRINT("invalid outEdge %p on node %p\n", outEdge, node); + return false; + } + } + // Check validity of nodeRefs_ + if (!hasNode(node)) { + DEBUG_PRINT("Invalid node %p\n", node); + return false; + } + } + for (auto& edge : getMutableEdges()) { + if (!hasNode(edge->tail())) { + DEBUG_PRINT("Invalid tail on edge %p\n", edge); + return false; + } + if (!hasNode(edge->head())) { + DEBUG_PRINT("Invalid head on edge %p\n", edge); + return false; + } + } + return true; + } + // Swap two nodes. // Any edge V -> N1 becomes V -> N2, and N1 -> V becomes N2 -> V. void swapNodes(NodeRef n1, NodeRef n2) { @@ -334,6 +432,15 @@ class Graph { return getEdgeIfExists(tail, head); } + bool hasEdge(EdgeRef e) const { + for (auto& edge : edges_) { + if (e == &edge) { + return true; + } + } + return false; + } + /// \brief Get a reference to the edge between two nodes if it exists. /// note: will fail assertion if the edge does not exist. EdgeRef getEdge(NodeRef tail, NodeRef head) const { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h index 8866fd5758d1d7..d7e77faf8a66d4 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h @@ -4,6 +4,7 @@ #include "caffe2/core/common.h" #include "nomnigraph/Graph/Graph.h" #include "nomnigraph/Representations/Compiler.h" +#include "nomnigraph/Support/Pointer.h" #include @@ -46,9 +47,12 @@ class BasicBlock { instructions_.emplace_back(node); trackNode(node); } - const std::vector& getInstructions() { + const std::vector& getInstructions() const { return instructions_; } + std::vector* getMutableInstructions() { + return &instructions_; + } bool hasInstruction(NodeRef instr) const { return nodes_.hasNode(instr); @@ -103,10 +107,16 @@ struct ControlFlowGraphImpl { template struct ControlFlowGraphImpl> { - using type = Graph>, int>; + using type = Graph, int>; using bbType = BasicBlock; }; +/// \brief Helper for extracting the type of BasicBlocks given +/// a graph (probably a dataflow graph). 
TODO: refactor this +/// to come from something like Graph::NodeDataType +template +using BasicBlockType = typename ControlFlowGraphImpl::bbType; + /// \brief Control flow graph is a graph of basic blocks that /// can be used as an analysis tool. /// @@ -120,31 +130,35 @@ class ControlFlowGraph : public ControlFlowGraphImpl::type { ControlFlowGraph(ControlFlowGraph&&) = default; ControlFlowGraph& operator=(ControlFlowGraph&&) = default; ~ControlFlowGraph() {} -}; - -/// \brief Helper for extracting the type of BasicBlocks given -/// a graph (probably a dataflow graph). TODO: refactor this -/// to come from something like Graph::NodeDataType -template -using BasicBlockType = typename ControlFlowGraphImpl::bbType; + std::unordered_map< + std::string, + typename ControlFlowGraphImpl::type::SubgraphType> + functions; + using BasicBlockRef = typename ControlFlowGraphImpl::type::NodeRef; + + // Named functions are simply basic blocks stored in labeled Subgraphs + BasicBlockRef createNamedFunction(std::string name) { + assert(name != "anonymous" && "Reserved token anonymous cannot be used"); + auto bb = this->createNode(BasicBlockType()); + assert(functions.count(name) == 0 && "Name already in use."); + typename ControlFlowGraphImpl::type::SubgraphType sg; + sg.addNode(bb); + functions[name] = sg; + return bb; + } -/// \brief Converts graph to SSA representation. Modifies the graph -/// by inserting versions and phi nodes. -template -void addSSA(G* dfg, ControlFlowGraph* cfg) { - static_assert( - std::is_base_of::value, - "Phi type must be derived from Instruction."); - auto dfMap = dominanceFrontierMap(cfg); - for (auto pair : dfMap) { - for (auto n : pair.second) { - printf( - "%llu -> %llu\n", - (unsigned long long)pair.first, - (unsigned long long)n); + // Anonymous functions are aggregated into a single Subgraph + BasicBlockRef createAnonymousFunction() { + if (!functions.count("anonymous")) { + functions["anonymous"] = + typename ControlFlowGraphImpl::type::SubgraphType(); } + + auto bb = this->createNode(BasicBlockType()); + functions["anonymous"].addNode(bb); + return bb; } -} +}; /// \brief Deletes a referenced node from the control flow graph. template diff --git a/caffe2/core/nomnigraph/tests/AlgorithmsTest.cc b/caffe2/core/nomnigraph/tests/AlgorithmsTest.cc index 68e0d18053f623..e0de91bb2ade32 100644 --- a/caffe2/core/nomnigraph/tests/AlgorithmsTest.cc +++ b/caffe2/core/nomnigraph/tests/AlgorithmsTest.cc @@ -96,3 +96,31 @@ TEST(DominatorTree, Test2) { } } +TEST(Subgraph, InduceEdges) { + auto g = createGraph(); + auto sg = decltype(g)::SubgraphType(); + for (const auto& node : g.getMutableNodes()) { + sg.addNode(node); + } + + nom::algorithm::induceEdges(&sg); + + for (const auto& edge : g.getMutableEdges()) { + EXPECT_TRUE(sg.hasEdge(edge)); + } +} + +TEST(Subgraph, InduceEdgesCycle) { + auto g = createGraphWithCycle(); + auto sg = decltype(g)::SubgraphType(); + for (const auto& node : g.getMutableNodes()) { + sg.addNode(node); + } + + nom::algorithm::induceEdges(&sg); + + for (const auto& edge : g.getMutableEdges()) { + EXPECT_TRUE(sg.hasEdge(edge)); + } +} + diff --git a/caffe2/core/nomnigraph/tests/GraphTest.cc b/caffe2/core/nomnigraph/tests/GraphTest.cc index 0138a9b5759d32..a0d61e19131f1a 100644 --- a/caffe2/core/nomnigraph/tests/GraphTest.cc +++ b/caffe2/core/nomnigraph/tests/GraphTest.cc @@ -117,4 +117,65 @@ TEST(Basic, HasNode) { // Current graph: 3 -> 4 , 2 // replaceNode doesn't delete n2. 
diff --git a/caffe2/core/nomnigraph/tests/GraphTest.cc b/caffe2/core/nomnigraph/tests/GraphTest.cc
index 0138a9b5759d32..a0d61e19131f1a 100644
--- a/caffe2/core/nomnigraph/tests/GraphTest.cc
+++ b/caffe2/core/nomnigraph/tests/GraphTest.cc
@@ -117,4 +117,65 @@ TEST(Basic, HasNode) {
   // Current graph: 3 -> 4, 2
   // replaceNode doesn't delete n2.
   EXPECT_TRUE(g.hasNode(n2));
+
+  // Create a second graph g2, and move the nodes from g2 to g.
+  TestClass t5;
+  nom::Graph<TestClass> g2;
+  nom::Graph<TestClass>::NodeRef n5 = g2.createNode(std::move(t5));
+  EXPECT_TRUE(g2.hasNode(n5));
+
+  EXPECT_FALSE(g.hasNode(n5));
+  g2.moveNode(n5, &g);
+  // Current graph (g1): 3 -> 4, 2, 5
+  EXPECT_TRUE(g.hasNode(n5));
+}
+
+TEST(Basic, Moves) {
+  TestGraph g;
+  auto n1 = createTestNode(g);
+  auto n2 = createTestNode(g);
+  auto n3 = createTestNode(g);
+  auto e1 = g.createEdge(n1, n2);
+  auto e2 = g.createEdge(n1, n3);
+  // Current graph: 1 -> 2, 1 -> 3
+
+  TestGraph g2;
+  g.deleteEdge(e2);
+  g.moveNode(n1, &g2);
+  g.moveNode(n2, &g2);
+  g.moveEdge(e1, &g2);
+  EXPECT_TRUE(g.isValid());
+  EXPECT_TRUE(g2.isValid());
+  EXPECT_EQ(g.getMutableNodes().size(), 1);
+  EXPECT_EQ(g2.getMutableNodes().size(), 2);
+  EXPECT_EQ(g.getMutableEdges().size(), 0);
+  EXPECT_EQ(g2.getMutableEdges().size(), 1);
+}
+
+TEST(Basic, MoveSubgraph) {
+  TestGraph g;
+  auto n1 = createTestNode(g);
+  auto n2 = createTestNode(g);
+  auto n3 = createTestNode(g);
+  auto e1 = g.createEdge(n1, n2);
+  auto e2 = g.createEdge(n1, n3);
+  // Current graph: 1 -> 2, 1 -> 3
+
+  TestGraph g2;
+
+  g.deleteEdge(e2);
+
+  TestGraph::SubgraphType sg;
+  sg.addNode(n1);
+  sg.addNode(n2);
+  sg.addEdge(e1);
+
+  g.moveSubgraph(sg, &g2);
+
+  EXPECT_TRUE(g.isValid());
+  EXPECT_TRUE(g2.isValid());
+  EXPECT_EQ(g.getMutableNodes().size(), 1);
+  EXPECT_EQ(g2.getMutableNodes().size(), 2);
+  EXPECT_EQ(g.getMutableEdges().size(), 0);
+  EXPECT_EQ(g2.getMutableEdges().size(), 1);
 }
diff --git a/caffe2/core/nomnigraph/tests/test_util.cc b/caffe2/core/nomnigraph/tests/test_util.cc
index 4ee423d0e0571a..9bf6e66f410bf2 100644
--- a/caffe2/core/nomnigraph/tests/test_util.cc
+++ b/caffe2/core/nomnigraph/tests/test_util.cc
@@ -63,11 +63,9 @@ nom::Graph<TestClass> createGraphWithCycle() {

 std::map<std::string, std::string> BBPrinter(typename nom::repr::NNCFGraph::NodeRef node) {
   std::map<std::string, std::string> labelMap;
-  assert(node->data() && "Node doesn't have data, can't render it");
-  auto *bb = dyn_cast<nom::repr::BasicBlockType<nom::repr::NNGraph>>(
-      node->data().get());
+  auto& bb = node->data();
   labelMap["label"] = to_string((unsigned long long)node) + "\\n";
-  for (const auto &instr : bb->getInstructions()) {
+  for (const auto& instr : bb.getInstructions()) {
     assert(isa<nom::repr::NeuralNetOperator>(instr->data()) &&
            "Invalid instruction.");
     auto *op = dyn_cast<nom::repr::NeuralNetOperator>(instr->data().get());
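The stats_put_ops changes that follow add a `bound` argument; the clamp threshold is int64_max / magnitude_expand, computed in integer arithmetic. A standalone sketch of that arithmetic (clampForExpansion is a hypothetical helper, mirroring the numbers the test later in this patch relies on: with magnitude_expand = 1e18 the threshold is 9, so an input of 10 clamps to 9):

// Sketch of the bound/clamp arithmetic introduced below; clampForExpansion
// is a hypothetical helper, not part of the operator.
#include <cassert>
#include <cstdint>
#include <limits>

int64_t clampForExpansion(double input, int64_t magnitude_expand) {
  // Largest magnitude that still fits in int64 after expansion.
  const int64_t bound_value =
      std::numeric_limits<int64_t>::max() / magnitude_expand;
  if (input < -static_cast<double>(bound_value)) {
    input = -static_cast<double>(bound_value);
  } else if (input > static_cast<double>(bound_value)) {
    input = static_cast<double>(bound_value);
  }
  return static_cast<int64_t>(input) * magnitude_expand;
}

int main() {
  const int64_t expand = 1000000000000000000LL; // 1e18, as in the test below
  // int64_max is ~9.22e18, so the integer-division threshold is 9.
  assert(clampForExpansion(10.0, expand) == 9 * expand);
  assert(clampForExpansion(-10.0, expand) == -9 * expand);
  assert(clampForExpansion(5.0, expand) == 5 * expand);
  return 0;
}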
diff --git a/caffe2/operators/stats_put_ops.cc b/caffe2/operators/stats_put_ops.cc
index 40c6b8cc60d085..5231d83d442500 100644
--- a/caffe2/operators/stats_put_ops.cc
+++ b/caffe2/operators/stats_put_ops.cc
@@ -25,6 +25,9 @@ OPERATOR_SCHEMA(AveragePut)
     .Arg(
         "magnitude_expand",
         "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+    .Arg(
+        "bound",
+        "(*boolean*): whether or not to clamp inputs to the maximum value allowed")
     .SetDoc(R"DOC(
 Consume a value and push it to the global stat registry as an average.
@@ -51,6 +54,9 @@ OPERATOR_SCHEMA(IncrementPut)
     .Arg(
         "magnitude_expand",
         "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+    .Arg(
+        "bound",
+        "(*boolean*): whether or not to clamp inputs to the maximum value allowed")
     .SetDoc(R"DOC(
 Consume a value and push it to the global stat registry as a sum.
@@ -77,6 +83,9 @@ OPERATOR_SCHEMA(StdDevPut)
     .Arg(
         "magnitude_expand",
         "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+    .Arg(
+        "bound",
+        "(*boolean*): whether or not to clamp inputs to the maximum value allowed")
     .SetDoc(R"DOC(
 Consume a value and push it to the global stat registry as a standard deviation.
diff --git a/caffe2/operators/stats_put_ops.h b/caffe2/operators/stats_put_ops.h
index cf41bb593a36cd..6c989e263bf7ab 100644
--- a/caffe2/operators/stats_put_ops.h
+++ b/caffe2/operators/stats_put_ops.h
@@ -1,3 +1,4 @@
+#include
 #include "caffe2/core/operator.h"
 #include "caffe2/core/stats.h"
@@ -14,6 +15,7 @@ struct TemplatePutOp : public Operator<CPUContext> {
             "stat_name", operator_def.input().Get(0))),
         magnitude_expand_(GetSingleArgument<int64_t>("magnitude_expand", 1)),
+        bound_(GetSingleArgument<bool>("bound", false)),
         stat_(given_name_) {}

   bool RunOnDevice() override {
@@ -31,12 +33,22 @@ struct TemplatePutOp : public Operator<CPUContext> {
   template <typename V>
   bool DoRunWithType() {
-    auto input = *Input(0).template data<V>();
+    V input = *Input(0).template data<V>();

-    CAFFE_ENFORCE(
-        static_cast<int64_t>(input + 1) <
-            std::numeric_limits<int64_t>::max() / magnitude_expand_,
-        "Input value is too large for the given magnitude expansion!");
+    int64_t bound_value =
+        std::numeric_limits<int64_t>::max() / magnitude_expand_;
+
+    if (bound_) {
+      if (input < -bound_value) {
+        input = -bound_value;
+      } else if (input > bound_value) {
+        input = bound_value;
+      }
+    } else {
+      CAFFE_ENFORCE(
+          std::abs(static_cast<int64_t>(input)) <= bound_value,
+          "Input value is too large for the given magnitude expansion!");
+    }

     int64_t int_value = input * magnitude_expand_;

@@ -48,6 +60,7 @@ struct TemplatePutOp : public Operator<CPUContext> {
  private:
   const std::string given_name_;
   const int64_t magnitude_expand_;
+  const bool bound_;
   T stat_;
 };
 } // namespace caffe2
diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc
index 46fd8349b05832..07740343375c8c 100644
--- a/caffe2/opt/converter.cc
+++ b/caffe2/opt/converter.cc
@@ -284,8 +283,7 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict) {
   /// \brief For the construction of the control flow graph we keep track
   /// of a current basic block, which we split up as we come across control
   /// flow operations such as if and while.
-  auto bbNode =
-      cfg.createNode(util::make_unique<repr::BasicBlockType<repr::NNGraph>>());
+  auto bbNode = cfg.createNamedFunction("main");

   for (auto &op : *net.mutable_op()) {
     auto opNode = dfg.createNode(); // Create an empty node for the operator.
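The hunk that follows can drop the `->get()` calls because basic-block data is now held in CFG nodes by value rather than behind std::unique_ptr, so data() and mutableData() hand back the block directly. A minimal sketch of that storage shape (a generic Node, not the nomnigraph class):

// Minimal sketch of by-value node data, the storage change reflected in
// the converter hunks around here (generic Node, not the nomnigraph one).
#include <cassert>
#include <utility>
#include <vector>

template <typename T>
class Node {
 public:
  explicit Node(T&& data) : data_(std::move(data)) {}
  const T& data() const { return data_; } // read access, no ->get()
  T* mutableData() { return &data_; }     // in-place mutation
 private:
  T data_;
};

int main() {
  Node<std::vector<int>> bb(std::vector<int>{});
  bb.mutableData()->push_back(42); // cf. pushInstructionNode(...)
  assert(bb.data().size() == 1);
  return 0;
}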
@@ -316,7 +315,7 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict) { } opNode->resetData(convertToNeuralNetOperator(op)); - auto currentBasicBlock = bbNode->mutableData()->get(); + auto currentBasicBlock = bbNode->mutableData(); currentBasicBlock->pushInstructionNode(opNode); } @@ -445,8 +444,8 @@ caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m, const caffe2::NetDef& old if (bbNode->getOutEdges().size() > 1) { CAFFE_THROW("Control flow not yet supported in Caffe2 converter."); } - auto bb = bbNode->data().get(); - for (const auto &instrNode : bb->getInstructions()) { + auto bb = bbNode->data(); + for (const auto& instrNode : bb.getInstructions()) { caffe2::OperatorDef op = convertToOperatorDef(instrNode); for (const auto &inEdge : instrNode->getInEdges()) { diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index 1b9b4929bb0f9b..ebf3c3b8cd44a5 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -831,7 +831,6 @@ def param_update_fun(model): return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix)) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") def test_equiv_recurrent(self): ''' Test that the model produces exactly same results given diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 5d41fdfb18262b..b7ad88376fdbb5 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -194,7 +194,6 @@ def ref(x, y): _test_binary("Mul", ref, filter_=not_overflow, test_gradient=True)(self) _test_binary_broadcast("Mul", ref, filter_=not_overflow)(self) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") def test_div(self): def ref(x, y): return (x / y, ) @@ -1825,7 +1824,6 @@ def ref(data): out, = self.assertReferenceChecks(gc, op, [a], ref) self.assertEqual(dst, out.dtype) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(a=hu.tensor(), eps=st.floats(min_value=1e-4, max_value=1e-2), a_grad=hu.tensor(elements=st.floats(min_value=0.01, max_value=0.99)), diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index abe1971680a7e9..407e92b65d41e4 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -12,12 +12,24 @@ class NNModule(object): - def __init__(self, net=None): + def __init__(self, net=None, device_map=None): if net is not None: + serialized_proto = None if isinstance(net, core.Net): - self._NNModule = C.NNModuleFromProtobuf(net.Proto().SerializeToString()) + serialized_proto = net.Proto().SerializeToString() elif isinstance(net, caffe2_pb2.NetDef): - self._NNModule = C.NNModuleFromProtobuf(net.SerializeToString()) + serialized_proto = net.SerializeToString() + + # Distributed + if device_map is not None: + serialized_device_map = {} + for k in device_map: + serialized_device_map[k] = device_map[k].SerializeToString() + self._NNModule = C.NNModuleFromProtobufDistributed(serialized_proto, + serialized_device_map) + # Default + elif serialized_proto: + self._NNModule = C.NNModuleFromProtobuf(serialized_proto) else: raise Exception( "NNModule can be constructed with core.Net or caffe2_pb2.NetDef types" diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index b899d7df37238e..a42c31a905266b 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals 
from caffe2.python import core, workspace, test_util +from caffe2.proto import caffe2_pb2 import caffe2.python.nomnigraph as ng from hypothesis import given @@ -224,7 +225,7 @@ def test_annotation_from_graph(self): new_annot = node.getAnnotation() assert new_annot.getDeviceType() == 7 - def test_distribute_annotations(self): + def test_distributed_annotations(self): nn = ng.NNModule() key = nn.dataFlow.createNode(ng.NeuralNetData("key")) length = nn.dataFlow.createNode(ng.NeuralNetData("length")) @@ -243,3 +244,12 @@ def test_distribute_annotations(self): assert len(new_annot.getComponentLevels()) == 3 assert new_annot.getComponentLevels()[0] == "" assert new_annot.getComponentLevels()[2] == "woot" + + def test_distributed_device_map(self): + net = core.Net("name") + net.FC(["X", "W"], ["Y"]) + d = caffe2_pb2.DeviceOption() + nn = ng.NNModule(net, {"X": d, "W": d}) + + with self.assertRaises(Exception): + nn = ng.NNModule(net, {"X": d, "Fake": d}) diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index 3e297416770ba0..8ff58a68ce1083 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -648,7 +648,6 @@ def test_bvlc_googlenet(self): def test_bvlc_reference_caffenet(self): self._test_net('bvlc_reference_caffenet') - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") def test_bvlc_reference_rcnn_ilsvrc13(self): self._test_net('bvlc_reference_rcnn_ilsvrc13') diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 766641ca4916dd..9ca42439918a56 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -60,10 +60,6 @@ if 'JENKINS_URL' in os.environ: backend_test.exclude(r'(test_vgg19|test_vgg)') -# FIXME: flaky test in CircleCI -if "IN_CIRCLECI" in os.environ: - backend_test.exclude(r'(test_dynamic_slice_cpu)') - # import all test cases at global scope to make them visible to python.unittest globals().update(backend_test .enable_report() diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index 69aead865d1c3f..7bc0440fe37ef8 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -12,6 +12,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV from caffe2.python.operator_test.adagrad_test_helper import ( ref_adagrad, adagrad_sparse_test_helper ) @@ -162,9 +163,9 @@ def ref_sparse(param, momentum, indices, grad, lr): gc, op, [param_i, momentum_i, indices, grad, lr], ref_sparse ) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") # Suppress filter_too_much health check. # Likely caused by `assume` call falling through too often. 
+ @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @settings(suppress_health_check=[HealthCheck.filter_too_much]) @given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index e37738801745c2..05ce3d0f94c80d 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -131,7 +131,6 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), rpn_min_level=st.integers(min_value=1, max_value=4), rpn_num_levels=st.integers(min_value=1, max_value=6), diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index a3cb4e3f05749e..0ac94ca5feeff6 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -447,7 +447,6 @@ def test_3d_convolution_cudnn_nchw(self, op_type, batch_size, stride, size, or "CUDNN_STATUS_NOT_SUPPORTED" not in es: raise e - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(op_type=st.sampled_from(["Conv", "Conv2D"]), stride=st.integers(1, 3), pad=st.integers(0, 3), diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index 603b7dd4af231d..2d2d3c119d4848 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -7,8 +7,10 @@ import numpy as np from scipy.misc import logsumexp import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV import hypothesis.strategies as st from hypothesis import given +import unittest class TestCRFOp(hu.HypothesisTestCase): @@ -56,6 +58,7 @@ def test_crf_with_loss_op(self, num_tags, num_words): err_msg='CRF LOSS is not matching the reference' ) + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @given(num_tags=st.integers(1, 4), num_words=st.integers(2, 4)) def test_crf_gradient(self, num_tags, num_words): diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index f97b0c5809d5fd..9691f8ef8cbade 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -250,7 +250,6 @@ def weighted_sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs): output_to_grad='xentropy', grad_reference=weighted_sigmoid_xentr_logit_grad_ref) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(n=st.integers(2, 10), b=st.integers(1, 5), **hu.gcs_cpu_only) diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index 583e1db1679f80..c659cc67516b21 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -377,7 +377,6 @@ def reference_conv_op(*args): # CUDNN does NOT support different padding values and we skip it @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") 
@given(stride_h=st.integers(1, 3), stride_w=st.integers(1, 3), pad_h=st.integers(0, 3), diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index a3ae3263034744..0adbd7d20b71fe 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -133,7 +133,6 @@ def sqr_op(X): self.assertGradientChecks( gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given( X=hu.tensor( elements=st.floats(0.1, 10), @@ -336,7 +335,6 @@ def sigmoid_ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0]) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(X=hu.tensor(dtype=np.float32), inplace=st.booleans(), alpha=st.floats(min_value=-100.0, max_value=100.0), diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index d10e19e4932ade..55307a02f9a894 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -7,9 +7,11 @@ from caffe2.python import core from hypothesis import assume, given, settings, HealthCheck import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np +import unittest class TestFcOperator(serial.SerializedTestCase): @@ -76,6 +78,7 @@ def fc_tranposed_op(X, W, b): self.assertGradientChecks(gc, op, [X, W, b], i, [0], threshold=threshold, stepsize=stepsize) + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) @serial.given(n=st.integers(1, 5), m=st.integers(0, 5), diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 082c0c20c46a62..9d5d0deda9acb2 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -16,7 +16,6 @@ class TestGroupConvolution(hu.HypothesisTestCase): - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(stride=st.integers(1, 3), pad=st.integers(0, 3), kernel=st.integers(1, 5), diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index ed8945b7927e90..ea30c6c7579b84 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -250,7 +250,6 @@ def generate_input_state(n, d): class GRUCellTest(serial.SerializedTestCase): # Test just for GRUUnitOp - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @serial.given( seed=st.integers(0, 2**32 - 1), input_tensor=gru_unit_op_input(), diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 46b16f4356ff5f..c8659f19615ed2 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -7,6 +7,7 @@ from hypothesis import assume, given import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV import hypothesis.strategies as st import numpy as np @@ -114,7 +115,7 @@ def test_im2col_layout(self, batch_size, stride, pad, kernel, dilation, atol=1e-4, 
rtol=1e-4) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @given(batch_size=st.integers(1, 3), stride=st.integers(1, 3), pad=st.integers(0, 3), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index 1d072a230ae3a7..52054f1dd839bf 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -52,7 +52,6 @@ def _feed_inputs(self, input_blobs, device_option): for name, blob in zip(names, input_blobs): self.ws.create_blob(name).feed(blob, device_option=device_option) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(gc=hu.gcs['gc'], dc=hu.gcs['dc'], N=st.integers(2, 3), diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 59203fd960c88b..507d3e73d660d2 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -7,6 +7,7 @@ from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import os @@ -14,7 +15,7 @@ class TestLayerNormOp(serial.SerializedTestCase): - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @serial.given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] @@ -86,7 +87,7 @@ def layer_norm_grad_ref(gout_full, norm, mean_full, stdev_full, X_full): outputs_to_check=[0], ) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_op(self, X, gc, dc): X = X[0] diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index 322b708daa46f1..e7bdf794b3e843 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -17,7 +17,6 @@ np.set_printoptions(precision=6) class TestFloatToFusedRandRowwiseQuantized(hu.HypothesisTestCase): - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(X=hu.tensor(min_dim=2, max_dim=2, min_value=1, max_value=17), # only matrix is supported bitwidth_=st.sampled_from([1, 2, 4, 8]), diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 26c323866df80b..ca736247e170c5 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -7,6 +7,7 @@ from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -15,7 +16,7 @@ import unittest class RecurrentNetworkTest(serial.SerializedTestCase): - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, 
"FIXME: flaky test in CircleCI") @given(T=st.integers(1, 4), n=st.integers(1, 5), d=st.integers(1, 5)) diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index a4ff61218cdb54..c8e8a32b902354 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -7,10 +7,12 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +from caffe2.python.test_util import IN_CIRCLECI_FLAKY_ENV import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np import itertools as it +import unittest class TestReduceOps(serial.SerializedTestCase): @@ -55,6 +57,7 @@ def run_reduce_op_test( self.run_reduce_op_test_impl( op_name, X, axes, keepdims, ref_func, gc, dc) + @unittest.skipIf(IN_CIRCLECI_FLAKY_ENV, "FIXME: flaky test in CircleCI") @serial.given( X=hu.tensor(max_dim=3, dtype=np.float32), keepdims=st.booleans(), num_axes=st.integers(1, 3), **hu.gcs) diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index e29c252345ab66..e9e09a54e319e2 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -188,7 +188,6 @@ def test_remove_padding(self, start_pad_width, end_pad_width, args, gc, dc): inputs=[data, lengths], reference=partial(_remove_padding_ref, start_pad_width, end_pad_width)) - @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @serial.given(start_pad_width=st.integers(min_value=0, max_value=2), end_pad_width=st.integers(min_value=0, max_value=2), args=_gen_test_add_padding(with_pad_data=True), diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py index d3757c3b396e50..9bf03ceeba27f6 100644 --- a/caffe2/python/operator_test/stats_put_ops_test.py +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -10,6 +10,37 @@ class TestPutOps(TestCase): + def test_clamp(self): + put_value = 10 + magnitude_expand = int(1e18) + stat_name = "stat".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + count_postfix = "/stat_value/count".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "AveragePut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand, + bound=True)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + 9 * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) + def test_avg_put_ops(self): put_value = 15.1111 magnitude_expand = 10000 diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index 40576bd6d9c65c..17402451120596 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -2,6 +2,7 @@ #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/opt/converter.h" +#include "caffe2/opt/distributed.h" #include "caffe2/proto/caffe2.pb.h" #include "caffe2/python/dlpack.h" #include "caffe2/python/pybind_state_registry.h" @@ -99,6 +100,24 @@ void 
addNomnigraphMethods(pybind11::module& m) {
         return caffe2::convertToNNModule(proto);
       });

+  m.def(
+      "NNModuleFromProtobufDistributed",
+      [](py::bytes def, std::map<std::string, py::bytes> blobToDeviceMap) {
+        std::map<std::string, caffe2::DeviceOption> m;
+        for (const auto& el : blobToDeviceMap) {
+          caffe2::DeviceOption d;
+          CAFFE_ENFORCE(
+              ParseProtoFromLargeString(el.second.cast<std::string>(), &d));
+          m[el.first] = d;
+        }
+
+        caffe2::NetDef proto;
+        CAFFE_ENFORCE(
+            ParseProtoFromLargeString(def.cast<std::string>(), &proto));
+
+        return caffe2::convertToNNModule(proto, m);
+      });
+
   py::class_<NNModule> nnmodule(m, "NNModule");
   nnmodule.def(py::init<>())
       .def(
diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py
index fd4b3ab030428d..46849ce07e44e4 100644
--- a/caffe2/python/test_util.py
+++ b/caffe2/python/test_util.py
@@ -8,7 +8,9 @@ from caffe2.python import core, workspace

 import unittest
+import os

+IN_CIRCLECI_FLAKY_ENV = "IN_CIRCLECI" in os.environ and "py2-gcc4.8-ubuntu14.04" in os.environ.get("BUILD_ENVIRONMENT", "")

 def rand_array(*dims):
     # np.random.rand() returns float instead of 0-dim array, that's why need to
diff --git a/caffe2/utils/thread_pool.h b/caffe2/utils/thread_pool.h
index aa2827fbab3e29..e87217b126de24 100644
--- a/caffe2/utils/thread_pool.h
+++ b/caffe2/utils/thread_pool.h
@@ -13,7 +13,15 @@

 namespace caffe2 {

-class CAFFE2_API TaskThreadPool {
+class CAFFE2_API TaskThreadPoolBase {
+ public:
+  virtual void run(const std::function<void()>& func) = 0;
+  virtual size_t size() const = 0;
+  virtual size_t numAvailable() const = 0;
+  virtual ~TaskThreadPoolBase() {}
+};
+
+class CAFFE2_API TaskThreadPool : public TaskThreadPoolBase {
  private:
   struct task_element_t {
     bool run_with_id;
@@ -51,7 +59,7 @@ class CAFFE2_API TaskThreadPool {
   }

   // Set running flag to false then notify all threads.
-  ~TaskThreadPool() {
+  ~TaskThreadPool() override {
     {
       std::unique_lock<std::mutex> lock(mutex_);
       running_ = false;
@@ -66,14 +74,14 @@
     }
   }

-  size_t size() const {
+  size_t size() const override {
     return threads_.size();
   }

   /**
    * The number of available (i.e. idle) threads in this thread pool.
    */
-  size_t num_available() const {
+  size_t numAvailable() const override {
     return available_;
   }

@@ -89,7 +97,7 @@ class CAFFE2_API TaskThreadPool {
     condition_.notify_one();
   }

-  void run(const std::function<void()>& func) {
+  void run(const std::function<void()>& func) override {
     runTask(func);
   }
diff --git a/caffe2/utils/threadpool/pthreadpool.cc b/caffe2/utils/threadpool/pthreadpool.cc
index 094e49cf49adb2..c26db116cfef85 100644
--- a/caffe2/utils/threadpool/pthreadpool.cc
+++ b/caffe2/utils/threadpool/pthreadpool.cc
@@ -69,15 +69,15 @@ void pthreadpool_compute_1d_tiled(
 struct compute_2d_context {
   pthreadpool_function_2d_t function;
   void* argument;
-  caffe2::FixedDivisor<int> range_j;
+  caffe2::FixedDivisor<int32_t> range_j;
 };

 static void compute_2d(const struct compute_2d_context* context, size_t linear_index) {
-  DCHECK_LE(linear_index, std::numeric_limits<int>::max());
+  DCHECK_LE(linear_index, std::numeric_limits<int32_t>::max());

-  int q;
-  int r;
-  context->range_j.DivMod((int)linear_index, &q, &r);
+  int32_t q;
+  int32_t r;
+  context->range_j.DivMod(static_cast<int32_t>(linear_index), &q, &r);
   context->function(context->argument, q, r);
 }

@@ -96,13 +96,12 @@ void pthreadpool_compute_2d(
       }
     }
   } else {
-    DCHECK_LE(range_i * range_j, (size_t) std::numeric_limits<int>::max());
+    DCHECK_LE(range_i * range_j, (size_t)std::numeric_limits<int32_t>::max());
     /* Execute in parallel on the thread pool using linearized index */
     struct compute_2d_context context = {
-      .function = function,
-      .argument = argument,
-      .range_j = caffe2::FixedDivisor<int>(range_j)
-    };
+        .function = function,
+        .argument = argument,
+        .range_j = caffe2::FixedDivisor<int32_t>(range_j)};
     pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
   }
 }
@@ -110,7 +109,7 @@ void pthreadpool_compute_2d(
 struct compute_2d_tiled_context {
   pthreadpool_function_2d_tiled_t function;
   void* argument;
-  caffe2::FixedDivisor<int> tile_range_j;
+  caffe2::FixedDivisor<int32_t> tile_range_j;
   size_t range_i;
   size_t range_j;
   size_t tile_i;
@@ -118,8 +117,8 @@ struct compute_2d_tiled_context {
 };

 static void compute_2d_tiled(const struct compute_2d_tiled_context* context, size_t linear_index) {
-  int q;
-  int r;
+  int32_t q;
+  int32_t r;

   context->tile_range_j.DivMod(linear_index, &q, &r);
   const size_t max_tile_i = context->tile_i;
@@ -151,16 +150,219 @@ void pthreadpool_compute_2d_tiled(
     /* Execute in parallel on the thread pool using linearized index */
     const size_t tile_range_i = divide_round_up(range_i, tile_i);
     const size_t tile_range_j = divide_round_up(range_j, tile_j);
-    DCHECK_LE(tile_range_i * tile_range_j, (size_t) std::numeric_limits<int>::max());
+    DCHECK_LE(
+        tile_range_i * tile_range_j,
+        (size_t)std::numeric_limits<int32_t>::max());
     struct compute_2d_tiled_context context = {
-      .function = function,
-      .argument = argument,
-      .tile_range_j = caffe2::FixedDivisor<int>(tile_range_j),
-      .range_i = range_i,
-      .range_j = range_j,
-      .tile_i = tile_i,
-      .tile_j = tile_j
-    };
+        .function = function,
+        .argument = argument,
+        .tile_range_j = caffe2::FixedDivisor<int32_t>(tile_range_j),
+        .range_i = range_i,
+        .range_j = range_j,
+        .tile_i = tile_i,
+        .tile_j = tile_j};
     pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
   }
 }
+
+struct compute_3d_tiled_context {
+  pthreadpool_function_3d_tiled_t function;
+  void* argument;
+  caffe2::FixedDivisor<int32_t> tile_range_j;
+  caffe2::FixedDivisor<int32_t> tile_range_k;
+  size_t range_i;
+  size_t range_j;
+  size_t range_k;
+  size_t tile_i;
+  size_t tile_j;
+  size_t tile_k;
+};
+
+static void compute_3d_tiled(
+    const struct compute_3d_tiled_context* context,
+    size_t linear_index) {
+  int32_t tile_index_ij, tile_index_k;
+  context->tile_range_k.DivMod(
+      static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_k);
+  int32_t tile_index_i, tile_index_j;
+  context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
+  const size_t max_tile_i = context->tile_i;
+  const size_t max_tile_j = context->tile_j;
+  const size_t max_tile_k = context->tile_k;
+  const size_t index_i = static_cast<size_t>(tile_index_i) * max_tile_i;
+  const size_t index_j = static_cast<size_t>(tile_index_j) * max_tile_j;
+  const size_t index_k = static_cast<size_t>(tile_index_k) * max_tile_k;
+  const size_t tile_i = min(max_tile_i, context->range_i - index_i);
+  const size_t tile_j = min(max_tile_j, context->range_j - index_j);
+  const size_t tile_k = min(max_tile_k, context->range_k - index_k);
+  context->function(
+      context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
+}
+
+void pthreadpool_compute_3d_tiled(
+    pthreadpool_t threadpool,
+    pthreadpool_function_3d_tiled_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j,
+    size_t range_k,
+    size_t tile_i,
+    size_t tile_j,
+    size_t tile_k) {
+  if (threadpool == NULL) {
+    /* No thread pool provided: execute function sequentially on the calling
+     * thread */
+    for (size_t i = 0; i < range_i; i += tile_i) {
+      for (size_t j = 0; j < range_j; j += tile_j) {
+        for (size_t k = 0; k < range_k; k += tile_k) {
+          function(
+              argument,
+              i,
+              j,
+              k,
+              min(range_i - i, tile_i),
+              min(range_j - j, tile_j),
+              min(range_k - k, tile_k));
+        }
+      }
+    }
+  } else {
+    /* Execute in parallel on the thread pool using linearized index */
+    const size_t tile_range_i = divide_round_up(range_i, tile_i);
+    const size_t tile_range_j = divide_round_up(range_j, tile_j);
+    const size_t tile_range_k = divide_round_up(range_k, tile_k);
+    DCHECK_LE(
+        tile_range_i * tile_range_j * tile_range_k,
+        (size_t)std::numeric_limits<int32_t>::max());
+    struct compute_3d_tiled_context context = {
+        .function = function,
+        .argument = argument,
+        .tile_range_j = caffe2::FixedDivisor<int32_t>(tile_range_j),
+        .tile_range_k = caffe2::FixedDivisor<int32_t>(tile_range_k),
+        .range_i = range_i,
+        .range_j = range_j,
+        .range_k = range_k,
+        .tile_i = tile_i,
+        .tile_j = tile_j,
+        .tile_k = tile_k};
+    pthreadpool_compute_1d(
+        threadpool,
+        (pthreadpool_function_1d_t)compute_3d_tiled,
+        &context,
+        tile_range_i * tile_range_j * tile_range_k);
+  }
+}
+
+struct compute_4d_tiled_context {
+  pthreadpool_function_4d_tiled_t function;
+  void* argument;
+  caffe2::FixedDivisor<int32_t> tile_range_kl;
+  caffe2::FixedDivisor<int32_t> tile_range_j;
+  caffe2::FixedDivisor<int32_t> tile_range_l;
+  size_t range_i;
+  size_t range_j;
+  size_t range_k;
+  size_t range_l;
+  size_t tile_i;
+  size_t tile_j;
+  size_t tile_k;
+  size_t tile_l;
+};
+
+static void compute_4d_tiled(
+    const struct compute_4d_tiled_context* context,
+    size_t linear_index) {
+  int32_t tile_index_ij, tile_index_kl;
+  context->tile_range_kl.DivMod(
+      static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_kl);
+  int32_t tile_index_i, tile_index_j;
+  context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
+  int32_t tile_index_k, tile_index_l;
+  context->tile_range_l.DivMod(tile_index_kl, &tile_index_k, &tile_index_l);
+  const size_t max_tile_i = context->tile_i;
+  const size_t max_tile_j = context->tile_j;
+  const size_t max_tile_k = context->tile_k;
+  const size_t max_tile_l = context->tile_l;
+  const size_t index_i = static_cast<size_t>(tile_index_i) * max_tile_i;
+  const size_t index_j =
+      static_cast<size_t>(tile_index_j) * max_tile_j;
+  const size_t index_k = static_cast<size_t>(tile_index_k) * max_tile_k;
+  const size_t index_l = static_cast<size_t>(tile_index_l) * max_tile_l;
+  const size_t tile_i = min(max_tile_i, context->range_i - index_i);
+  const size_t tile_j = min(max_tile_j, context->range_j - index_j);
+  const size_t tile_k = min(max_tile_k, context->range_k - index_k);
+  const size_t tile_l = min(max_tile_l, context->range_l - index_l);
+  context->function(
+      context->argument,
+      index_i,
+      index_j,
+      index_k,
+      index_l,
+      tile_i,
+      tile_j,
+      tile_k,
+      tile_l);
+}
+
+void pthreadpool_compute_4d_tiled(
+    pthreadpool_t threadpool,
+    pthreadpool_function_4d_tiled_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j,
+    size_t range_k,
+    size_t range_l,
+    size_t tile_i,
+    size_t tile_j,
+    size_t tile_k,
+    size_t tile_l) {
+  if (threadpool == NULL) {
+    /* No thread pool provided: execute function sequentially on the calling
+     * thread */
+    for (size_t i = 0; i < range_i; i += tile_i) {
+      for (size_t j = 0; j < range_j; j += tile_j) {
+        for (size_t k = 0; k < range_k; k += tile_k) {
+          for (size_t l = 0; l < range_l; l += tile_l) {
+            function(
+                argument,
+                i,
+                j,
+                k,
+                l,
+                min(range_i - i, tile_i),
+                min(range_j - j, tile_j),
+                min(range_k - k, tile_k),
+                min(range_l - l, tile_l));
+          }
+        }
+      }
+    }
+  } else {
+    /* Execute in parallel on the thread pool using linearized index */
+    const size_t tile_range_i = divide_round_up(range_i, tile_i);
+    const size_t tile_range_j = divide_round_up(range_j, tile_j);
+    const size_t tile_range_k = divide_round_up(range_k, tile_k);
+    const size_t tile_range_l = divide_round_up(range_l, tile_l);
+    DCHECK_LE(
+        tile_range_i * tile_range_j * tile_range_k * tile_range_l,
+        (size_t)std::numeric_limits<int32_t>::max());
+    struct compute_4d_tiled_context context = {
+        .function = function,
+        .argument = argument,
+        .tile_range_kl = caffe2::FixedDivisor<int32_t>(tile_range_k * tile_range_l),
+        .tile_range_j = caffe2::FixedDivisor<int32_t>(tile_range_j),
+        .tile_range_l = caffe2::FixedDivisor<int32_t>(tile_range_l),
+        .range_i = range_i,
+        .range_j = range_j,
+        .range_k = range_k,
+        .range_l = range_l,
+        .tile_i = tile_i,
+        .tile_j = tile_j,
+        .tile_k = tile_k,
+        .tile_l = tile_l};
+    pthreadpool_compute_1d(
+        threadpool,
+        (pthreadpool_function_1d_t)compute_4d_tiled,
+        &context,
+        tile_range_i * tile_range_j * tile_range_k * tile_range_l);
+  }
+}
diff --git a/caffe2/utils/threadpool/pthreadpool.h b/caffe2/utils/threadpool/pthreadpool.h
index 3de21a79051413..118ca723fbfe5f 100644
--- a/caffe2/utils/threadpool/pthreadpool.h
+++ b/caffe2/utils/threadpool/pthreadpool.h
@@ -14,7 +14,24 @@ typedef void (*pthreadpool_function_1d_t)(void*, size_t);
 typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
-typedef void (*pthreadpool_function_3d_t)(void*, size_t, size_t, size_t);
+typedef void (*pthreadpool_function_3d_tiled_t)(
+    void*,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t);
+typedef void (*pthreadpool_function_4d_tiled_t)(
+    void*,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t,
+    size_t);

 #ifdef __cplusplus
 extern "C" {
@@ -86,6 +103,30 @@ void pthreadpool_compute_2d_tiled(
     size_t tile_i,
     size_t tile_j);

+void pthreadpool_compute_3d_tiled(
+    pthreadpool_t threadpool,
+    pthreadpool_function_3d_tiled_t function,
+    void* argument,
+    size_t range_i,
+    size_t range_j,
+    size_t range_k,
size_t tile_i, + size_t tile_j, + size_t tile_k); + +void pthreadpool_compute_4d_tiled( + pthreadpool_t threadpool, + pthreadpool_function_4d_tiled_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t tile_i, + size_t tile_j, + size_t tile_k, + size_t tile_l); + /** * Terminates threads in the thread pool and releases associated resources. * diff --git a/cmake/External/nccl.cmake b/cmake/External/nccl.cmake index 99b4e783a40a51..04a84bad2a8821 100644 --- a/cmake/External/nccl.cmake +++ b/cmake/External/nccl.cmake @@ -9,7 +9,7 @@ if (NOT __NCCL_INCLUDED) target_include_directories(__caffe2_nccl INTERFACE ${NCCL_INCLUDE_DIRS}) else() # build directory - set(nccl_PREFIX ${PROJECT_SOURCE_DIR}/third_party/nccl) + set(nccl_PREFIX ${PROJECT_SOURCE_DIR}/third_party/nccl/nccl) # we build nccl statically, but want to link it into the caffe shared library # this requires position-independent code diff --git a/setup.py b/setup.py index 71bea1bb867b18..99ecde1c41dc00 100644 --- a/setup.py +++ b/setup.py @@ -848,7 +848,7 @@ def run(self): if USE_ROCM: CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed']) THD_LIB = os.path.join(lib_path, 'libTHD.a') -NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1') +NCCL_LIB = os.path.join(lib_path, 'libnccl.so.2') C10D_LIB = os.path.join(lib_path, 'libc10d.a') # static library only @@ -863,7 +863,7 @@ def run(self): CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib')) if USE_ROCM: CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib')) - NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib') + NCCL_LIB = os.path.join(lib_path, 'libnccl.2.dylib') if IS_WINDOWS: CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')] diff --git a/test/common_nn.py b/test/common_nn.py index a3273f67b76381..153ef2e74b2805 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -451,8 +451,8 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele # this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): - input_lengths = torch.tensor(input_lengths, dtype=torch.long) - target_lengths = torch.tensor(target_lengths, dtype=torch.long) + input_lengths = torch.as_tensor(input_lengths, dtype=torch.long) + target_lengths = torch.as_tensor(target_lengths, dtype=torch.long) dt = log_probs.dtype log_probs = log_probs.double() # we need the accuracy as we are not in logspace targets = targets.long() diff --git a/test/custom_operator/test_custom_ops.py b/test/custom_operator/test_custom_ops.py index ab6c958acd55fc..8fb67e5fdd4d98 100644 --- a/test/custom_operator/test_custom_ops.py +++ b/test/custom_operator/test_custom_ops.py @@ -4,6 +4,7 @@ import unittest import torch +from torch import ops from model import Model, get_custom_op_library_path @@ -11,19 +12,19 @@ class TestCustomOperators(unittest.TestCase): def setUp(self): self.library_path = get_custom_op_library_path() - torch.ops.load_library(self.library_path) + ops.load_library(self.library_path) def test_custom_library_is_loaded(self): - self.assertIn(self.library_path, torch.ops.loaded_libraries) + self.assertIn(self.library_path, ops.loaded_libraries) def test_calling_custom_op(self): - output = torch.ops.custom.op(torch.ones(5), 2.0, 3) + output = ops.custom.op(torch.ones(5), 2.0, 3) self.assertEqual(type(output), 
list) self.assertEqual(len(output), 3) for tensor in output: self.assertTrue(tensor.allclose(torch.ones(5) * 2)) - output = torch.ops.custom.op_with_defaults(torch.ones(5)) + output = ops.custom.op_with_defaults(torch.ones(5)) self.assertEqual(type(output), list) self.assertEqual(len(output), 1) self.assertTrue(output[0].allclose(torch.ones(5))) diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect index 6b15b3e7997dea..a21d867f01e779 100644 --- a/test/expect/TestBatched.test_for.expect +++ b/test/expect/TestBatched.test_for.expect @@ -4,19 +4,18 @@ graph(%x.1_data : Dynamic %y_data : Dynamic %y_mask : Dynamic %y_dims : Dynamic) { - %6 : int = prim::Constant[value=10]() + %6 : int = prim::Constant[value=1]() %7 : bool = prim::Constant[value=1]() - %x : Dynamic, %9 : Dynamic, %10 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) + %8 : int = prim::Constant[value=10]() + %x : Dynamic, %10 : Dynamic, %11 : Dynamic = prim::Loop(%8, %7, %x.1_data, %x.1_mask, %x.1_dims) block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { - %15 : int = prim::Constant[value=1]() - %16 : Long() = prim::NumToTensor(%15) + %16 : Long() = prim::NumToTensor(%6) %alpha : float = prim::TensorToNum(%16) %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) %mask : Dynamic = aten::mul(%5_mask, %y_mask) %dims : Dynamic = aten::__or__(%5_dims, %y_dims) - %21 : bool = prim::Constant[value=1]() %data : Dynamic = aten::where(%mask, %data.1, %5_data) - -> (%21, %data, %mask, %dims) + -> (%7, %data, %mask, %dims) } - return (%x, %9, %10); + return (%x, %10, %11); } diff --git a/test/expect/TestBatched.test_if_else.expect b/test/expect/TestBatched.test_if_else.expect index 86475a61e3adae..2856146efc7213 100644 --- a/test/expect/TestBatched.test_if_else.expect +++ b/test/expect/TestBatched.test_if_else.expect @@ -4,38 +4,36 @@ graph(%a.1_data : Dynamic %b_data : Dynamic %b_mask : Dynamic %b_dims : Dynamic) { - %6 : Dynamic = aten::gt(%a.1_data, %b_data) - %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %9 : bool = prim::TensorToBool(%6) - %10 : int = prim::Constant[value=1]() - %11 : Long() = prim::NumToTensor(%10) + %6 : int = prim::Constant[value=1]() + %7 : Dynamic = aten::gt(%a.1_data, %b_data) + %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %10 : bool = prim::TensorToBool(%7) + %11 : Long() = prim::NumToTensor(%6) %alpha.1 : float = prim::TensorToNum(%11) %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %16 : int = prim::Constant[value=1]() - %17 : Long() = prim::NumToTensor(%16) - %alpha : float = prim::TensorToNum(%17) + %16 : Long() = prim::NumToTensor(%6) + %alpha : float = prim::TensorToNum(%16) %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %21 : bool = prim::Constant[value=1]() %22 : int = prim::Constant[value=1]() - %23 : Dynamic = aten::type_as(%7, %6) - %cond_mask.1 : Dynamic = aten::mul(%6, %23) + %23 : Dynamic = aten::type_as(%8, %7) + %cond_mask.1 : Dynamic = aten::mul(%7, %23) %25 : int = aten::dim(%cond_mask.1) %26 : bool = aten::eq(%25, %22) %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%26) block0() { %30 : int = aten::dim(%data.1) %31 : int = aten::sub(%30, %22) - %32 : bool = 
prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%31, %32, %cond_mask.1) - block0(%_ : int, %35 : Dynamic) { - %36 : int = aten::dim(%35) - %data.2 : Dynamic = aten::unsqueeze(%35, %36) - %38 : bool = prim::Constant[value=1]() - -> (%38, %data.2) + %data.3 : Dynamic = prim::Loop(%31, %21, %cond_mask.1) + block0(%_ : int, %34 : Dynamic) { + %35 : int = aten::dim(%34) + %data.2 : Dynamic = aten::unsqueeze(%34, %35) + -> (%21, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) diff --git a/test/expect/TestBatched.test_if_else_with_scalar.expect b/test/expect/TestBatched.test_if_else_with_scalar.expect index cbe4a9f05bca4b..901fc44bc9028f 100644 --- a/test/expect/TestBatched.test_if_else_with_scalar.expect +++ b/test/expect/TestBatched.test_if_else_with_scalar.expect @@ -4,39 +4,37 @@ graph(%a.1_data : Dynamic %b_data : Dynamic %b_mask : Dynamic %b_dims : Dynamic) { - %6 : float = prim::Constant[value=0.1]() - %7 : Float() = prim::NumToTensor(%6) - %other : float = prim::TensorToNum(%7) - %9 : Dynamic = aten::gt(%a.1_data, %other) - %10 : bool = prim::TensorToBool(%9) - %11 : int = prim::Constant[value=1]() - %12 : Long() = prim::NumToTensor(%11) + %6 : int = prim::Constant[value=1]() + %7 : float = prim::Constant[value=0.1]() + %8 : Float() = prim::NumToTensor(%7) + %other : float = prim::TensorToNum(%8) + %10 : Dynamic = aten::gt(%a.1_data, %other) + %11 : bool = prim::TensorToBool(%10) + %12 : Long() = prim::NumToTensor(%6) %alpha.1 : float = prim::TensorToNum(%12) %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %17 : int = prim::Constant[value=1]() - %18 : Long() = prim::NumToTensor(%17) - %alpha : float = prim::TensorToNum(%18) + %17 : Long() = prim::NumToTensor(%6) + %alpha : float = prim::TensorToNum(%17) %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %22 : bool = prim::Constant[value=1]() %23 : int = prim::Constant[value=1]() - %24 : Dynamic = aten::type_as(%a.1_mask, %9) - %cond_mask.1 : Dynamic = aten::mul(%9, %24) + %24 : Dynamic = aten::type_as(%a.1_mask, %10) + %cond_mask.1 : Dynamic = aten::mul(%10, %24) %26 : int = aten::dim(%cond_mask.1) %27 : bool = aten::eq(%26, %23) %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%27) block0() { %31 : int = aten::dim(%data.1) %32 : int = aten::sub(%31, %23) - %33 : bool = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%32, %33, %cond_mask.1) - block0(%_ : int, %36 : Dynamic) { - %37 : int = aten::dim(%36) - %data.2 : Dynamic = aten::unsqueeze(%36, %37) - %39 : bool = prim::Constant[value=1]() - -> (%39, %data.2) + %data.3 : Dynamic = prim::Loop(%32, %22, %cond_mask.1) + block0(%_ : int, %35 : Dynamic) { + %36 : int = aten::dim(%35) + %data.2 : Dynamic = aten::unsqueeze(%35, %36) + -> (%22, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) diff --git a/test/expect/TestBatched.test_if_noelse.expect b/test/expect/TestBatched.test_if_noelse.expect index 8dbed77571217f..70070998a26054 100644 --- a/test/expect/TestBatched.test_if_noelse.expect +++ b/test/expect/TestBatched.test_if_noelse.expect @@ -4,32 +4,31 @@ graph(%a.1_data : Dynamic %b_data : Dynamic %b_mask : Dynamic %b_dims : Dynamic) { - %6 : Dynamic = 
aten::gt(%a.1_data, %b_data) - %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %9 : bool = prim::TensorToBool(%6) - %10 : int = prim::Constant[value=1]() - %11 : Long() = prim::NumToTensor(%10) + %6 : int = prim::Constant[value=1]() + %7 : Dynamic = aten::gt(%a.1_data, %b_data) + %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %10 : bool = prim::TensorToBool(%7) + %11 : Long() = prim::NumToTensor(%6) %alpha : float = prim::TensorToNum(%11) %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %16 : int = prim::Constant[value=1]() - %17 : Dynamic = aten::type_as(%7, %6) - %cond_mask.1 : Dynamic = aten::mul(%6, %17) - %19 : int = aten::dim(%cond_mask.1) - %20 : bool = aten::eq(%19, %16) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%20) + %16 : bool = prim::Constant[value=1]() + %17 : int = prim::Constant[value=1]() + %18 : Dynamic = aten::type_as(%8, %7) + %cond_mask.1 : Dynamic = aten::mul(%7, %18) + %20 : int = aten::dim(%cond_mask.1) + %21 : bool = aten::eq(%20, %17) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%21) block0() { - %24 : int = aten::dim(%data.1) - %25 : int = aten::sub(%24, %16) - %26 : bool = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%25, %26, %cond_mask.1) + %25 : int = aten::dim(%data.1) + %26 : int = aten::sub(%25, %17) + %data.3 : Dynamic = prim::Loop(%26, %16, %cond_mask.1) block0(%_ : int, %29 : Dynamic) { %30 : int = aten::dim(%29) %data.2 : Dynamic = aten::unsqueeze(%29, %30) - %32 : bool = prim::Constant[value=1]() - -> (%32, %data.2) + -> (%16, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) diff --git a/test/expect/TestBatched.test_if_noelse_with_scalar.expect b/test/expect/TestBatched.test_if_noelse_with_scalar.expect index d8f453c9652d7e..a1e6841f02b517 100644 --- a/test/expect/TestBatched.test_if_noelse_with_scalar.expect +++ b/test/expect/TestBatched.test_if_noelse_with_scalar.expect @@ -4,33 +4,32 @@ graph(%a.1_data : Dynamic %b_data : Dynamic %b_mask : Dynamic %b_dims : Dynamic) { - %6 : float = prim::Constant[value=0.1]() - %7 : Float() = prim::NumToTensor(%6) - %other : float = prim::TensorToNum(%7) - %9 : Dynamic = aten::gt(%a.1_data, %other) - %10 : bool = prim::TensorToBool(%9) - %11 : int = prim::Constant[value=1]() - %12 : Long() = prim::NumToTensor(%11) + %6 : int = prim::Constant[value=1]() + %7 : float = prim::Constant[value=0.1]() + %8 : Float() = prim::NumToTensor(%7) + %other : float = prim::TensorToNum(%8) + %10 : Dynamic = aten::gt(%a.1_data, %other) + %11 : bool = prim::TensorToBool(%10) + %12 : Long() = prim::NumToTensor(%6) %alpha : float = prim::TensorToNum(%12) %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %17 : int = prim::Constant[value=1]() - %18 : Dynamic = aten::type_as(%a.1_mask, %9) - %cond_mask.1 : Dynamic = aten::mul(%9, %18) - %20 : int = aten::dim(%cond_mask.1) - %21 : bool = aten::eq(%20, %17) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%21) + %17 : bool = prim::Constant[value=1]() + %18 : int = prim::Constant[value=1]() + %19 : Dynamic = aten::type_as(%a.1_mask, %10) + %cond_mask.1 : Dynamic = aten::mul(%10, %19) + %21 : int = 
aten::dim(%cond_mask.1) + %22 : bool = aten::eq(%21, %18) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%22) block0() { - %25 : int = aten::dim(%data.1) - %26 : int = aten::sub(%25, %17) - %27 : bool = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%26, %27, %cond_mask.1) + %26 : int = aten::dim(%data.1) + %27 : int = aten::sub(%26, %18) + %data.3 : Dynamic = prim::Loop(%27, %17, %cond_mask.1) block0(%_ : int, %30 : Dynamic) { %31 : int = aten::dim(%30) %data.2 : Dynamic = aten::unsqueeze(%30, %31) - %33 : bool = prim::Constant[value=1]() - -> (%33, %data.2) + -> (%17, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect index 5cd196b56f3bfb..b55f9dd6a83eec 100644 --- a/test/expect/TestBatched.test_while.expect +++ b/test/expect/TestBatched.test_while.expect @@ -4,20 +4,20 @@ graph(%a.1_data : Dynamic %b_data : Dynamic %b_mask : Dynamic %b_dims : Dynamic) { - %6 : int = prim::Constant[value=2147483647]() - %7 : Dynamic = aten::gt(%a.1_data, %b_data) - %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %10 : bool = prim::TensorToBool(%7) - %11 : int = prim::Constant[value=0]() - %12 : Dynamic = aten::mul(%7, %8) - %13 : Dynamic = aten::sum(%12) - %14 : Dynamic = aten::gt(%13, %11) - %15 : bool = prim::TensorToBool(%14) - %16 : Dynamic, %17 : Dynamic, %18 : Dynamic, %a : Dynamic, %20 : Dynamic, %21 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) + %6 : int = prim::Constant[value=1]() + %7 : int = prim::Constant[value=2147483647]() + %8 : Dynamic = aten::gt(%a.1_data, %b_data) + %9 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %10 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %11 : bool = prim::TensorToBool(%8) + %12 : int = prim::Constant[value=0]() + %13 : Dynamic = aten::mul(%8, %9) + %14 : Dynamic = aten::sum(%13) + %15 : Dynamic = aten::gt(%14, %12) + %16 : bool = prim::TensorToBool(%15) + %17 : Dynamic, %18 : Dynamic, %19 : Dynamic, %a : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%7, %16, %8, %9, %10, %a.1_data, %a.1_mask, %a.1_dims) block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { - %29 : int = prim::Constant[value=1]() - %30 : Long() = prim::NumToTensor(%29) + %30 : Long() = prim::NumToTensor(%6) %alpha : float = prim::TensorToNum(%30) %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%6_mask, %b_mask) @@ -26,22 +26,21 @@ graph(%a.1_data : Dynamic %36 : Dynamic = aten::mul(%mask, %b_mask) %37 : Dynamic = aten::__or__(%dims, %b_dims) %38 : bool = prim::TensorToBool(%35) - %39 : int = prim::Constant[value=1]() - %40 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) - %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %40) - %42 : int = aten::dim(%cond_mask.1) - %43 : bool = aten::eq(%42, %39) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%43) + %39 : bool = prim::Constant[value=1]() + %40 : int = prim::Constant[value=1]() + %41 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) + %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %41) + %43 : int = aten::dim(%cond_mask.1) + %44 : bool = aten::eq(%43, %40) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%44) block0() { - %47 : int = aten::dim(%data.1) - %48 : 
int = aten::sub(%47, %39) - %49 : bool = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%48, %49, %cond_mask.1) + %48 : int = aten::dim(%data.1) + %49 : int = aten::sub(%48, %40) + %data.3 : Dynamic = prim::Loop(%49, %39, %cond_mask.1) block0(%_ : int, %52 : Dynamic) { %53 : int = aten::dim(%52) %data.2 : Dynamic = aten::unsqueeze(%52, %53) - %55 : bool = prim::Constant[value=1]() - -> (%55, %data.2) + -> (%39, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) @@ -53,12 +52,12 @@ graph(%a.1_data : Dynamic %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) %res_dims : Dynamic = aten::__or__(%dims, %6_dims) - %61 : int = prim::Constant[value=0]() - %62 : Dynamic = aten::mul(%35, %36) - %63 : Dynamic = aten::sum(%62) - %64 : Dynamic = aten::gt(%63, %61) - %65 : bool = prim::TensorToBool(%64) - -> (%65, %35, %36, %37, %res_data, %res_mask, %res_dims) + %60 : int = prim::Constant[value=0]() + %61 : Dynamic = aten::mul(%35, %36) + %62 : Dynamic = aten::sum(%61) + %63 : Dynamic = aten::gt(%62, %60) + %64 : bool = prim::TensorToBool(%63) + -> (%64, %35, %36, %37, %res_data, %res_mask, %res_dims) } - return (%a, %20, %21); + return (%a, %21, %22); } diff --git a/test/expect/TestJit.test_alexnet.expect b/test/expect/TestJit.test_alexnet.expect index cd328e4bc4ebff..5b301ab1d1e7d4 100644 --- a/test/expect/TestJit.test_alexnet.expect +++ b/test/expect/TestJit.test_alexnet.expect @@ -21,46 +21,48 @@ graph(%0 : Double(1, 3, 224, 224) %20 : int[] = prim::ListConstruct(%19, %19), scope: AlexNet/Sequential[features]/Conv2d[0] %21 : int = prim::Constant[value=1](), scope: AlexNet/Sequential[features]/Conv2d[0] %22 : int[] = prim::ListConstruct(%21, %21), scope: AlexNet/Sequential[features]/Conv2d[0] - %23 : int = prim::Constant[value=0](), scope: AlexNet/Sequential[features]/Conv2d[0] - %24 : int[] = prim::ListConstruct(%23, %23), scope: AlexNet/Sequential[features]/Conv2d[0] - %25 : Double(1, 64, 55, 55) = aten::_convolution(%0, %1, %2, %18, %20, %22, %23, %24, %21, %23, %23, %21), scope: AlexNet/Sequential[features]/Conv2d[0] - %26 : Double(1, 64, 55, 55) = aten::threshold(%25, %23, %23), scope: AlexNet/Sequential[features]/ReLU[1] - %27 : int = prim::Constant[value=3](), scope: AlexNet/Sequential[features]/MaxPool2d[2] - %28 : int[] = prim::ListConstruct(%27, %27), scope: AlexNet/Sequential[features]/MaxPool2d[2] - %29 : Double(1, 64, 27, 27), %30 : Long(1, 64, 27, 27) = aten::max_pool2d_with_indices(%26, %28, %20, %24, %22, %23), scope: AlexNet/Sequential[features]/MaxPool2d[2] - %31 : Double(1, 192, 27, 27) = aten::_convolution(%29, %3, %4, %22, %20, %22, %23, %24, %21, %23, %23, %21), scope: AlexNet/Sequential[features]/Conv2d[3] - %32 : Double(1, 192, 27, 27) = aten::threshold(%31, %23, %23), scope: AlexNet/Sequential[features]/ReLU[4] - %33 : Double(1, 192, 13, 13), %34 : Long(1, 192, 13, 13) = aten::max_pool2d_with_indices(%32, %28, %20, %24, %22, %23), scope: AlexNet/Sequential[features]/MaxPool2d[5] - %35 : Double(1, 384, 13, 13) = aten::_convolution(%33, %5, %6, %22, %22, %22, %23, %24, %21, %23, %23, %21), scope: AlexNet/Sequential[features]/Conv2d[6] - %36 : Double(1, 384, 13, 13) = aten::threshold(%35, %23, %23), scope: AlexNet/Sequential[features]/ReLU[7] - %37 : Double(1, 256, 13, 13) = aten::_convolution(%36, %7, %8, %22, %22, %22, %23, %24, %21, %23, %23, %21), scope: AlexNet/Sequential[features]/Conv2d[8] - %38 : Double(1, 
256, 13, 13) = aten::threshold(%37, %23, %23), scope: AlexNet/Sequential[features]/ReLU[9] - %39 : Double(1, 256, 13, 13) = aten::_convolution(%38, %9, %10, %22, %22, %22, %23, %24, %21, %23, %23, %21), scope: AlexNet/Sequential[features]/Conv2d[10] - %40 : Double(1, 256, 13, 13) = aten::threshold(%39, %23, %23), scope: AlexNet/Sequential[features]/ReLU[11] - %41 : Double(1, 256, 6, 6), %42 : Long(1, 256, 6, 6) = aten::max_pool2d_with_indices(%40, %28, %20, %24, %22, %23), scope: AlexNet/Sequential[features]/MaxPool2d[12] - %43 : int = aten::size(%41, %23), scope: AlexNet - %44 : Long() = prim::NumToTensor(%43), scope: AlexNet - %45 : int = prim::TensorToNum(%44), scope: AlexNet - %46 : int = prim::Constant[value=9216](), scope: AlexNet - %47 : int[] = prim::ListConstruct(%45, %46), scope: AlexNet - %48 : Double(1, 9216) = aten::view(%41, %47), scope: AlexNet - %49 : float = prim::Constant[value=0.5](), scope: AlexNet/Sequential[classifier]/Dropout[0] - %50 : Double(1!, 9216) = aten::dropout(%48, %49, %21), scope: AlexNet/Sequential[classifier]/Dropout[0] - %51 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1] - %52 : int = prim::Constant[value=4096](), scope: AlexNet/Sequential[classifier]/Linear[1] - %53 : int[] = prim::ListConstruct(%21, %52), scope: AlexNet/Sequential[classifier]/Linear[1] - %54 : Double(1, 4096) = aten::expand(%12, %53, %21), scope: AlexNet/Sequential[classifier]/Linear[1] - %55 : Double(1, 4096) = aten::addmm(%54, %50, %51, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[1] - %56 : Double(1, 4096) = aten::threshold(%55, %23, %23), scope: AlexNet/Sequential[classifier]/ReLU[2] - %57 : Double(1!, 4096) = aten::dropout(%56, %49, %21), scope: AlexNet/Sequential[classifier]/Dropout[3] - %58 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4] - %59 : Double(1, 4096) = aten::expand(%14, %53, %21), scope: AlexNet/Sequential[classifier]/Linear[4] - %60 : Double(1, 4096) = aten::addmm(%59, %57, %58, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[4] - %61 : Double(1, 4096) = aten::threshold(%60, %23, %23), scope: AlexNet/Sequential[classifier]/ReLU[5] - %62 : Double(4096!, 1000!) 
= aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6] - %63 : int = prim::Constant[value=1000](), scope: AlexNet/Sequential[classifier]/Linear[6] - %64 : int[] = prim::ListConstruct(%21, %63), scope: AlexNet/Sequential[classifier]/Linear[6] - %65 : Double(1, 1000) = aten::expand(%16, %64, %21), scope: AlexNet/Sequential[classifier]/Linear[6] - %66 : Double(1, 1000) = aten::addmm(%65, %61, %62, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[6] - return (%66); + %23 : bool = prim::Constant[value=0](), scope: AlexNet/Sequential[features]/Conv2d[0] + %24 : int = prim::Constant[value=0](), scope: AlexNet/Sequential[features]/Conv2d[0] + %25 : int[] = prim::ListConstruct(%24, %24), scope: AlexNet/Sequential[features]/Conv2d[0] + %26 : bool = prim::Constant[value=1](), scope: AlexNet/Sequential[features]/Conv2d[0] + %27 : Double(1, 64, 55, 55) = aten::_convolution(%0, %1, %2, %18, %20, %22, %23, %25, %21, %23, %23, %26), scope: AlexNet/Sequential[features]/Conv2d[0] + %28 : Double(1, 64, 55, 55) = aten::threshold(%27, %24, %24), scope: AlexNet/Sequential[features]/ReLU[1] + %29 : int = prim::Constant[value=3](), scope: AlexNet/Sequential[features]/MaxPool2d[2] + %30 : int[] = prim::ListConstruct(%29, %29), scope: AlexNet/Sequential[features]/MaxPool2d[2] + %31 : Double(1, 64, 27, 27), %32 : Long(1, 64, 27, 27) = aten::max_pool2d_with_indices(%28, %30, %20, %25, %22, %23), scope: AlexNet/Sequential[features]/MaxPool2d[2] + %33 : Double(1, 192, 27, 27) = aten::_convolution(%31, %3, %4, %22, %20, %22, %23, %25, %21, %23, %23, %26), scope: AlexNet/Sequential[features]/Conv2d[3] + %34 : Double(1, 192, 27, 27) = aten::threshold(%33, %24, %24), scope: AlexNet/Sequential[features]/ReLU[4] + %35 : Double(1, 192, 13, 13), %36 : Long(1, 192, 13, 13) = aten::max_pool2d_with_indices(%34, %30, %20, %25, %22, %23), scope: AlexNet/Sequential[features]/MaxPool2d[5] + %37 : Double(1, 384, 13, 13) = aten::_convolution(%35, %5, %6, %22, %22, %22, %23, %25, %21, %23, %23, %26), scope: AlexNet/Sequential[features]/Conv2d[6] + %38 : Double(1, 384, 13, 13) = aten::threshold(%37, %24, %24), scope: AlexNet/Sequential[features]/ReLU[7] + %39 : Double(1, 256, 13, 13) = aten::_convolution(%38, %7, %8, %22, %22, %22, %23, %25, %21, %23, %23, %26), scope: AlexNet/Sequential[features]/Conv2d[8] + %40 : Double(1, 256, 13, 13) = aten::threshold(%39, %24, %24), scope: AlexNet/Sequential[features]/ReLU[9] + %41 : Double(1, 256, 13, 13) = aten::_convolution(%40, %9, %10, %22, %22, %22, %23, %25, %21, %23, %23, %26), scope: AlexNet/Sequential[features]/Conv2d[10] + %42 : Double(1, 256, 13, 13) = aten::threshold(%41, %24, %24), scope: AlexNet/Sequential[features]/ReLU[11] + %43 : Double(1, 256, 6, 6), %44 : Long(1, 256, 6, 6) = aten::max_pool2d_with_indices(%42, %30, %20, %25, %22, %23), scope: AlexNet/Sequential[features]/MaxPool2d[12] + %45 : int = aten::size(%43, %24), scope: AlexNet + %46 : Long() = prim::NumToTensor(%45), scope: AlexNet + %47 : int = prim::TensorToNum(%46), scope: AlexNet + %48 : int = prim::Constant[value=9216](), scope: AlexNet + %49 : int[] = prim::ListConstruct(%47, %48), scope: AlexNet + %50 : Double(1, 9216) = aten::view(%43, %49), scope: AlexNet + %51 : float = prim::Constant[value=0.5](), scope: AlexNet/Sequential[classifier]/Dropout[0] + %52 : Double(1!, 9216) = aten::dropout(%50, %51, %26), scope: AlexNet/Sequential[classifier]/Dropout[0] + %53 : Double(9216!, 4096!) 
= aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1] + %54 : int = prim::Constant[value=4096](), scope: AlexNet/Sequential[classifier]/Linear[1] + %55 : int[] = prim::ListConstruct(%21, %54), scope: AlexNet/Sequential[classifier]/Linear[1] + %56 : Double(1, 4096) = aten::expand(%12, %55, %26), scope: AlexNet/Sequential[classifier]/Linear[1] + %57 : Double(1, 4096) = aten::addmm(%56, %52, %53, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[1] + %58 : Double(1, 4096) = aten::threshold(%57, %24, %24), scope: AlexNet/Sequential[classifier]/ReLU[2] + %59 : Double(1!, 4096) = aten::dropout(%58, %51, %26), scope: AlexNet/Sequential[classifier]/Dropout[3] + %60 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4] + %61 : Double(1, 4096) = aten::expand(%14, %55, %26), scope: AlexNet/Sequential[classifier]/Linear[4] + %62 : Double(1, 4096) = aten::addmm(%61, %59, %60, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[4] + %63 : Double(1, 4096) = aten::threshold(%62, %24, %24), scope: AlexNet/Sequential[classifier]/ReLU[5] + %64 : Double(4096!, 1000!) = aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6] + %65 : int = prim::Constant[value=1000](), scope: AlexNet/Sequential[classifier]/Linear[6] + %66 : int[] = prim::ListConstruct(%21, %65), scope: AlexNet/Sequential[classifier]/Linear[6] + %67 : Double(1, 1000) = aten::expand(%16, %66, %26), scope: AlexNet/Sequential[classifier]/Linear[6] + %68 : Double(1, 1000) = aten::addmm(%67, %63, %64, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[6] + return (%68); } diff --git a/test/expect/TestJit.test_constant_prop_if_constant.expect b/test/expect/TestJit.test_constant_prop_if_constant.expect index d373c1ee1204b3..d1e11f4ab70cbf 100644 --- a/test/expect/TestJit.test_constant_prop_if_constant.expect +++ b/test/expect/TestJit.test_constant_prop_if_constant.expect @@ -21,11 +21,8 @@ graph(%a : Dynamic } %c0.5 : int = aten::add(%c0.4, %c2.1) %11 : int = prim::Constant[value=5]() - %12 : int = prim::Constant[value=1]() - %13 : Dynamic = aten::add(%a, %c0.5, %12) - %14 : int = prim::Constant[value=1]() - %15 : Dynamic = aten::add(%13, %c1, %14) - %16 : int = prim::Constant[value=1]() - %17 : Dynamic = aten::add(%15, %11, %16) - return (%17); + %12 : Dynamic = aten::add(%a, %c0.5, %c2.1) + %13 : Dynamic = aten::add(%12, %c1, %c2.1) + %14 : Dynamic = aten::add(%13, %11, %c2.1) + return (%14); } diff --git a/test/expect/TestJit.test_constant_prop_loop_constant.expect b/test/expect/TestJit.test_constant_prop_loop_constant.expect index f5bc3f57205ac3..d2d1e538016123 100644 --- a/test/expect/TestJit.test_constant_prop_loop_constant.expect +++ b/test/expect/TestJit.test_constant_prop_loop_constant.expect @@ -1,19 +1,17 @@ graph() { - %b.4 : int = prim::Constant[value=2]() - %b.2 : int = prim::Constant[value=1]() - %2 : int = prim::Constant[value=2147483647]() + %0 : bool = prim::Constant[value=0]() + %1 : bool = prim::Constant[value=1]() %b.1 : int = prim::Constant[value=0]() - %4 : bool = prim::Constant[value=1]() - %b.3 : int = prim::Loop(%2, %4, %b.1) - block0(%6 : int, %7 : int) { - %8 : bool = prim::Constant[value=1]() - -> (%8, %b.2) + %3 : int = prim::Constant[value=2147483647]() + %b.2 : int = prim::Constant[value=1]() + %b.4 : int = prim::Constant[value=2]() + %b.3 : int = prim::Loop(%3, %1, %b.1) + block0(%7 : int, %8 : int) { + -> (%1, %b.2) } - %9 : bool = prim::Constant[value=0]() - %b : int = prim::Loop(%2, %9, %b.3) - block0(%11 : int, %12 : int) { - %13 : bool = 
prim::Constant[value=0]() - -> (%13, %b.4) + %b : int = prim::Loop(%3, %0, %b.3) + block0(%10 : int, %11 : int) { + -> (%0, %b.4) } return (%b); } diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect index 6f72acc4c8483e..7e5aa5c766fc7a 100644 --- a/test/expect/TestJit.test_constant_prop_print.expect +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -1,8 +1,8 @@ graph(%input_tensor : Dynamic) { - %1 : int = prim::Constant[value=6]() - = prim::Print(%1) - %2 : int = prim::Constant[value=8]() - %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%input_tensor, %2, %3) + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=6]() + = prim::Print(%2) + %3 : int = prim::Constant[value=8]() + %4 : Dynamic = aten::add(%input_tensor, %3, %1) return (%4); } diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect index ec7769c5f195d5..e8b912e55c3eb1 100644 --- a/test/expect/TestJit.test_constant_prop_rand.expect +++ b/test/expect/TestJit.test_constant_prop_rand.expect @@ -1,11 +1,11 @@ graph() { - %0 : int = prim::Constant[value=2]() - %1 : int[] = prim::Constant[value=[3]]() - %2 : int = prim::Constant[value=6]() - %3 : int = prim::Constant[value=0]() - %4 : int[] = prim::Constant[value=[0, -1]]() - %a : Dynamic = aten::randn(%1, %2, %3, %4) - %6 : int = prim::Constant[value=1]() - %b : Dynamic = aten::add(%a, %0, %6) + %0 : int = prim::Constant[value=1]() + %1 : int[] = prim::Constant[value=[0, -1]]() + %2 : int = prim::Constant[value=0]() + %3 : int = prim::Constant[value=6]() + %4 : int = prim::Constant[value=2]() + %5 : int[] = prim::Constant[value=[3]]() + %a : Dynamic = aten::randn(%5, %3, %2, %1) + %b : Dynamic = aten::add(%a, %4, %0) return (%b); } diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect index 71cf099a54a663..58e8dae26c86b3 100644 --- a/test/expect/TestJit.test_constant_prop_simple.expect +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -1,6 +1,6 @@ graph(%input_tensor : Dynamic) { - %1 : int = prim::Constant[value=8]() - %2 : int = prim::Constant[value=1]() - %3 : Dynamic = aten::add(%input_tensor, %1, %2) + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=8]() + %3 : Dynamic = aten::add(%input_tensor, %2, %1) return (%3); } diff --git a/test/expect/TestJit.test_decompose_addmm.expect b/test/expect/TestJit.test_decompose_addmm.expect index 1785fdd65a0e79..fb6a03866de66c 100644 --- a/test/expect/TestJit.test_decompose_addmm.expect +++ b/test/expect/TestJit.test_decompose_addmm.expect @@ -3,23 +3,21 @@ graph(%mat : Dynamic %mat2 : Dynamic %alpha : Dynamic %beta : Dynamic) { - %5 : float = prim::Constant[value=2]() + %5 : int = prim::Constant[value=1]() %6 : float = prim::Constant[value=4.2]() - %7 : Dynamic = aten::mm(%mat1, %mat2) - %8 : int = prim::Constant[value=1]() - %9 : Dynamic = aten::add(%mat, %7, %8) - %10 : Dynamic = aten::mm(%mat1, %mat2) - %11 : int = prim::Constant[value=1]() - %12 : Dynamic = aten::add(%mat, %10, %11) - %c : Dynamic = aten::addmm(%mat, %mat1, %mat2, %5, %6) - %14 : int = prim::TensorToNum(%alpha) - %15 : int = prim::TensorToNum(%beta) - %d : Dynamic = aten::addmm(%mat, %mat1, %mat2, %15, %14) - %17 : int = prim::Constant[value=1]() - %18 : Dynamic = aten::add(%9, %12, %17) - %19 : int = prim::Constant[value=1]() - %20 : Dynamic = aten::add(%18, %c, %19) - %21 : int = 
prim::Constant[value=1]() - %22 : Dynamic = aten::add(%20, %d, %21) - return (%22); + %7 : float = prim::Constant[value=2]() + %8 : Dynamic = aten::mm(%mat1, %mat2) + %9 : int = prim::Constant[value=1]() + %10 : Dynamic = aten::add(%mat, %8, %9) + %11 : Dynamic = aten::mm(%mat1, %mat2) + %12 : int = prim::Constant[value=1]() + %13 : Dynamic = aten::add(%mat, %11, %12) + %c : Dynamic = aten::addmm(%mat, %mat1, %mat2, %7, %6) + %15 : int = prim::TensorToNum(%alpha) + %16 : int = prim::TensorToNum(%beta) + %d : Dynamic = aten::addmm(%mat, %mat1, %mat2, %16, %15) + %18 : Dynamic = aten::add(%10, %13, %5) + %19 : Dynamic = aten::add(%18, %c, %5) + %20 : Dynamic = aten::add(%19, %d, %5) + return (%20); } diff --git a/test/expect/TestJit.test_export_expand_aten_fallback.expect b/test/expect/TestJit.test_export_expand_aten_fallback.expect index 8baa082f72dbb3..4047f9c16b549b 100644 --- a/test/expect/TestJit.test_export_expand_aten_fallback.expect +++ b/test/expect/TestJit.test_export_expand_aten_fallback.expect @@ -6,27 +6,26 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "y.1", type:Tensor dims: 3 4 1}] - outputs: [{name: "5", type:Tensor dims: 3 4 4}] + outputs: [{name: "6", type:Tensor dims: 3 4 4}] initializers: [] nodes: [ Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Loop", inputs: [3,4,y.1], outputs: [5], attributes: [{ name: 'body', type: graph, value: + Node {type: "Constant", inputs: [], outputs: [5], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Loop", inputs: [3,2,y.1], outputs: [6], attributes: [{ name: 'body', type: graph, value: GraphProto { name: "torch-jit-export1" - inputs: [{name: "i", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "8", type:Tensor dims: }] - outputs: [{name: "15", type:Tensor dims: },{name: "16", type:Tensor dims: }] + inputs: [{name: "i", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "9", type:Tensor dims: }] + outputs: [{name: "2", type:Tensor dims: },{name: "15", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Unsqueeze", inputs: [2], outputs: [9], attributes: [{ name: 'axes', type: ints, values: [0]}]}, - Node {type: "Unsqueeze", inputs: [1], outputs: [10], attributes: [{ name: 'axes', type: ints, values: [0]}]}, - Node {type: "Unsqueeze", inputs: [i], outputs: [11], attributes: [{ name: 'axes', type: ints, values: [0]}]}, - Node {type: "Concat", inputs: [9,10,11], outputs: [12], attributes: [{ name: 'axis', type: int, value: 0}]}, - Node {type: "Constant", inputs: [], outputs: [13], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "ATen", inputs: [y.1,12,13], outputs: [16], attributes: [{ name: 'operator', type: string, value: 'expand'}]}, - Node {type: "Constant", inputs: [], outputs: [15], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]} + Node {type: "Unsqueeze", inputs: [4], outputs: [10], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [5], outputs: [11], attributes: 
[{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [i], outputs: [12], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Concat", inputs: [10,11,12], outputs: [13], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "ATen", inputs: [y.1,13,1], outputs: [15], attributes: [{ name: 'operator', type: string, value: 'expand'}]} ] } diff --git a/test/expect/TestJit.test_lstm_fusion_concat_cuda.expect b/test/expect/TestJit.test_lstm_fusion_concat_cuda.expect index a4457584f37edc..78eb85e2472442 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat_cuda.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat_cuda.expect @@ -5,13 +5,13 @@ graph(%0 : Float(*, *) %4 : Float(*, *) %5 : Float(*) %6 : Float(*)) { - %7 : Float(*, *) = aten::t(%3) - %8 : Float(*, *) = aten::mm(%0, %7) - %9 : int = prim::Constant[value=1]() - %10 : Float(*, *) = aten::add(%5, %8, %9) + %7 : int = prim::Constant[value=1]() + %8 : Float(*, *) = aten::t(%3) + %9 : Float(*, *) = aten::mm(%0, %8) + %10 : Float(*, *) = aten::add(%5, %9, %7) %11 : Float(*, *) = aten::t(%4) %12 : Float(*, *) = aten::mm(%1, %11) - %13 : Float(*, *) = aten::add(%6, %12, %9) + %13 : Float(*, *) = aten::add(%6, %12, %7) %14 : Dynamic[] = prim::ListConstruct(%10, %13) %15 : Dynamic[] = aten::broadcast_tensors(%14) %16 : Dynamic, %17 : Dynamic = prim::ListUnpack(%15) diff --git a/test/expect/TestJit.test_lstm_fusion_cuda.expect b/test/expect/TestJit.test_lstm_fusion_cuda.expect index 963908af49504c..bc3660650cb16e 100644 --- a/test/expect/TestJit.test_lstm_fusion_cuda.expect +++ b/test/expect/TestJit.test_lstm_fusion_cuda.expect @@ -5,13 +5,13 @@ graph(%0 : Float(*, *) %4 : Float(*, *) %5 : Float(*) %6 : Float(*)) { - %7 : Float(*, *) = aten::t(%3) - %8 : Float(*, *) = aten::mm(%0, %7) - %9 : int = prim::Constant[value=1]() - %10 : Float(*, *) = aten::add(%5, %8, %9) + %7 : int = prim::Constant[value=1]() + %8 : Float(*, *) = aten::t(%3) + %9 : Float(*, *) = aten::mm(%0, %8) + %10 : Float(*, *) = aten::add(%5, %9, %7) %11 : Float(*, *) = aten::t(%4) %12 : Float(*, *) = aten::mm(%1, %11) - %13 : Float(*, *) = aten::add(%6, %12, %9) + %13 : Float(*, *) = aten::add(%6, %12, %7) %14 : Dynamic[] = prim::ListConstruct(%10, %13) %15 : Dynamic[] = aten::broadcast_tensors(%14) %16 : Dynamic, %17 : Dynamic = prim::ListUnpack(%15) diff --git a/test/expect/TestScript.test_call_python_fn_from_script_fn.expect b/test/expect/TestScript.test_call_python_fn_from_script_fn.expect index da94046e583537..d34325ecdbe84c 100644 --- a/test/expect/TestScript.test_call_python_fn_from_script_fn.expect +++ b/test/expect/TestScript.test_call_python_fn_from_script_fn.expect @@ -1,7 +1,6 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() %2 : Dynamic = ^python_fn()(%x) - %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%2, %1, %3) - return (%4); + %3 : Dynamic = aten::add(%2, %1, %1) + return (%3); } diff --git a/test/expect/TestScript.test_call_python_mod_from_script_fn.expect b/test/expect/TestScript.test_call_python_mod_from_script_fn.expect index ab5c6c19cba034..9251057b28c71d 100644 --- a/test/expect/TestScript.test_call_python_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_python_mod_from_script_fn.expect @@ -1,7 +1,6 @@ graph(%x : Dynamic) { %2 : int = prim::Constant[value=1]() %1 : Dynamic = ^()(%x) - %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%1, %2, %3) + %4 : Dynamic = aten::add(%1, %2, %2) return (%4); } diff --git 
a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect index b2159144f798fc..2379d692a52455 100644 --- a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect @@ -1,7 +1,6 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() %2 : Dynamic = aten::neg(%x) - %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%2, %1, %3) - return (%4); + %3 : Dynamic = aten::add(%2, %1, %1) + return (%3); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect index 3478376f829c85..6c8be4909ccd3a 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect @@ -1,14 +1,13 @@ graph(%x : Dynamic) { - %1 : int = prim::Constant[value=1]() - %2 : int = prim::Constant[value=3]() - %3 : int = prim::Constant[value=4]() - %4 : int[] = prim::ListConstruct(%3, %2) - %5 : int = prim::Constant[value=6]() - %6 : int = prim::Constant[value=0]() - %7 : int[] = prim::Constant[value=[0, -1]]() - %8 : Dynamic = aten::zeros(%4, %5, %6, %7) + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=4]() + %3 : int = prim::Constant[value=6]() + %4 : int = prim::Constant[value=0]() + %5 : int[] = prim::Constant[value=[0, -1]]() + %6 : int = prim::Constant[value=1]() + %7 : int[] = prim::ListConstruct(%2, %1) + %8 : Dynamic = aten::zeros(%7, %3, %4, %5) %9 : Dynamic = aten::mm(%x, %8) - %10 : int = prim::Constant[value=1]() - %11 : Dynamic = aten::add(%9, %1, %10) - return (%11); + %10 : Dynamic = aten::add(%9, %6, %6) + return (%10); } diff --git a/test/expect/TestScript.test_call_traced_fn_from_script_fn.expect b/test/expect/TestScript.test_call_traced_fn_from_script_fn.expect index 8b0b447e56c940..e0a9cf2216fc56 100644 --- a/test/expect/TestScript.test_call_traced_fn_from_script_fn.expect +++ b/test/expect/TestScript.test_call_traced_fn_from_script_fn.expect @@ -1,7 +1,6 @@ graph(%x : Dynamic) { %2 : int = prim::Constant[value=1]() %1 : Double(3, 4) = aten::neg(%x) - %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%1, %2, %3) + %4 : Dynamic = aten::add(%1, %2, %2) return (%4); } diff --git a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect index 4ebe25af42c778..ae5b1b7e45759f 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect @@ -1,14 +1,13 @@ graph(%x : Dynamic) { - %9 : int = prim::Constant[value=1]() - %1 : int = prim::Constant[value=4]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %5 : int = prim::Constant[value=0]() + %4 : int = prim::Constant[value=7]() %2 : int = prim::Constant[value=3]() + %1 : int = prim::Constant[value=4]() + %9 : int = prim::Constant[value=1]() %3 : int[] = prim::ListConstruct(%1, %2) - %4 : int = prim::Constant[value=7]() - %5 : int = prim::Constant[value=0]() - %6 : int[] = prim::Constant[value=[0, -1]]() %7 : Double(4, 3) = aten::zeros(%3, %4, %5, %6) %8 : Double(3, 3) = aten::mm(%x, %7) - %10 : int = prim::Constant[value=1]() - %11 : Dynamic = aten::add(%8, %9, %10) + %11 : Dynamic = aten::add(%8, %9, %9) return (%11); } diff --git a/test/expect/TestScript.test_constant_pooling.expect 
b/test/expect/TestScript.test_constant_pooling.expect new file mode 100644 index 00000000000000..d01001c1ff1c75 --- /dev/null +++ b/test/expect/TestScript.test_constant_pooling.expect @@ -0,0 +1,33 @@ +graph(%cond : Dynamic) { + %1 : int[] = prim::Constant[value=[1]]() + %2 : int[] = prim::Constant[value=[0]]() + %3 : int = prim::Constant[value=3]() + %4 : Float(2) = prim::Constant[value= 4 4 [ CPUFloatType{2} ]]() + %5 : Float(2) = prim::Constant[value= 1 1 [ CPUFloatType{2} ]]() + %c.1 : int = prim::Constant[value=0]() + %a : int = prim::Constant[value=1]() + %d : string = prim::Constant[value="abc"]() + %e : string = prim::Constant[value="bcd"]() + %10 : int = prim::Constant[value=6]() + %11 : int[] = prim::Constant[value=[0, -1]]() + %12 : bool = prim::TensorToBool(%cond) + %c : int, %y : Dynamic = prim::If(%12) + block0() { + -> (%3, %4) + } + block1() { + %y.2 : Dynamic = aten::rand(%2, %10, %c.1, %11) + %16 : bool = prim::TensorToBool(%cond) + %y.4 : Dynamic = prim::If(%16) + block0() { + %y.3 : Dynamic = aten::rand(%1, %10, %c.1, %11) + -> (%y.3) + } + block1() { + -> (%y.2) + } + = prim::Print(%d, %e, %d, %5, %y.4, %5) + -> (%c.1, %y.4) + } + return (%a, %3, %c, %5); +} diff --git a/test/expect/TestScript.test_erase_number_types.expect b/test/expect/TestScript.test_erase_number_types.expect index ad75ae5c895d28..9d991080522cf9 100644 --- a/test/expect/TestScript.test_erase_number_types.expect +++ b/test/expect/TestScript.test_erase_number_types.expect @@ -1,12 +1,10 @@ graph(%a : Dynamic) { - %1 : Long() = prim::Constant[value={3}]() + %1 : Long() = prim::Constant[value={7}]() %2 : Long() = prim::Constant[value={1}]() - %3 : Long() = prim::Constant[value={7}]() - %4 : Long() = aten::add(%3, %2) - %b : Long() = aten::add(%4, %1) - %6 : Long() = prim::Constant[value={1}]() - %c.1 : Dynamic = aten::add(%a, %b, %6) - %8 : Long() = prim::Constant[value={1}]() - %c : Dynamic = aten::add(%c.1, %b, %8) + %3 : Long() = prim::Constant[value={3}]() + %4 : Long() = aten::add(%1, %2) + %b : Long() = aten::add(%4, %3) + %c.1 : Dynamic = aten::add(%a, %b, %2) + %c : Dynamic = aten::add(%c.1, %b, %2) return (%c); } diff --git a/test/expect/TestScript.test_export_dynamic_slice.expect b/test/expect/TestScript.test_export_dynamic_slice.expect index 7182fec515ae74..84e06b2e5763fb 100644 --- a/test/expect/TestScript.test_export_dynamic_slice.expect +++ b/test/expect/TestScript.test_export_dynamic_slice.expect @@ -11,25 +11,23 @@ ModelProto { nodes: [ Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Gather", inputs: [x,2], outputs: [3], attributes: [{ name: 'axis', type: int, value: 0}]}, - Node {type: "Shape", inputs: [x], outputs: [4], attributes: []}, - Node {type: "Gather", inputs: [4,1], outputs: [5], attributes: [{ name: 'axis', type: int, value: 0}]}, - Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Loop", inputs: [5,6,3], outputs: [7], attributes: [{ name: 'body', type: graph, value: + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Gather", inputs: [x,2], outputs: [4], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Shape", inputs: [x], outputs: [5], attributes: []}, + Node 
{type: "Gather", inputs: [5,3], outputs: [6], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Loop", inputs: [6,1,4], outputs: [7], attributes: [{ name: 'body', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [{name: "i", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "10", type:Tensor dims: }] - outputs: [{name: "18", type:Tensor dims: },{name: "17", type:Tensor dims: }] + outputs: [{name: "1", type:Tensor dims: },{name: "16", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [11], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Unsqueeze", inputs: [2], outputs: [12], attributes: [{ name: 'axes', type: ints, values: [0]}]}, - Node {type: "Unsqueeze", inputs: [i], outputs: [13], attributes: [{ name: 'axes', type: ints, values: [0]}]}, - Node {type: "Unsqueeze", inputs: [11], outputs: [14], attributes: [{ name: 'axes', type: ints, values: [0]}]}, - Node {type: "DynamicSlice", inputs: [x,12,13,14], outputs: [15], attributes: []}, - Node {type: "ReduceSum", inputs: [15], outputs: [16], attributes: [{ name: 'axes', type: ints, values: [0]},{ name: 'keepdims', type: int, value: 0}]}, - Node {type: "Add", inputs: [10,16], outputs: [17], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [18], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]} + Node {type: "Unsqueeze", inputs: [2], outputs: [11], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [i], outputs: [12], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [2], outputs: [13], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "DynamicSlice", inputs: [x,11,12,13], outputs: [14], attributes: []}, + Node {type: "ReduceSum", inputs: [14], outputs: [15], attributes: [{ name: 'axes', type: ints, values: [0]},{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Add", inputs: [10,15], outputs: [16], attributes: []} ] } diff --git a/test/expect/TestScript.test_if_list.expect b/test/expect/TestScript.test_if_list.expect index 93c942d4dc1eba..ff2d10edb61bb9 100644 --- a/test/expect/TestScript.test_if_list.expect +++ b/test/expect/TestScript.test_if_list.expect @@ -1,6 +1,7 @@ graph(%x : Double(*, *)) { - %1 : bool = prim::Constant[value=1]() - %c : Dynamic[] = prim::If(%1) + %1 : int = prim::Constant[value=0]() + %2 : bool = prim::Constant[value=1]() + %c : Dynamic[] = prim::If(%2) block0() { %c.1 : Dynamic[] = prim::ListConstruct(%x, %x) -> (%c.1) @@ -9,7 +10,6 @@ graph(%x : Double(*, *)) { %c.2 : Dynamic[] = prim::ListConstruct(%x, %x, %x) -> (%c.2) } - %5 : int = prim::Constant[value=0]() - %6 : Dynamic = aten::cat(%c, %5) + %6 : Dynamic = aten::cat(%c, %1) return (%6); } diff --git a/test/expect/TestScript.test_logical_short_circuit.expect b/test/expect/TestScript.test_logical_short_circuit.expect index 0a4569c2b445b5..37abfdfbb6ff6a 100644 --- a/test/expect/TestScript.test_logical_short_circuit.expect +++ b/test/expect/TestScript.test_logical_short_circuit.expect @@ -1,34 +1,32 @@ graph(%t : Dynamic) { - %c1.2 : int = prim::Constant[value=0]() + %1 : bool = prim::Constant[value=1]() + %2 : bool = prim::Constant[value=0]() %c1.1 : int = prim::Constant[value=1]() - %3 : bool = prim::Constant[value=0]() - %4 : bool = prim::If(%3) + %c1.2 : int = prim::Constant[value=0]() + %5 : bool = prim::If(%2) block0() { - %5 : int = prim::Constant[value=0]() - %6 : Dynamic = 
aten::select(%t, %5, %c1.1) + %6 : Dynamic = aten::select(%t, %c1.2, %c1.1) %7 : bool = prim::TensorToBool(%6) -> (%7) } block1() { - -> (%3) + -> (%2) } - %8 : bool = prim::If(%4) + %8 : bool = prim::If(%5) block0() { - -> (%4) + -> (%5) } block1() { - %9 : bool = prim::Constant[value=1]() - %10 : bool = prim::If(%9) + %9 : bool = prim::If(%1) block0() { - -> (%9) + -> (%1) } block1() { - %11 : int = prim::Constant[value=0]() - %12 : Dynamic = aten::select(%t, %11, %c1.1) - %13 : bool = prim::TensorToBool(%12) - -> (%13) + %10 : Dynamic = aten::select(%t, %c1.2, %c1.1) + %11 : bool = prim::TensorToBool(%10) + -> (%11) } - -> (%10) + -> (%9) } %c1 : int = prim::If(%8) block0() { diff --git a/test/expect/TestScript.test_loop_unroll_unused_counter.expect b/test/expect/TestScript.test_loop_unroll_unused_counter.expect index feb204a92579b6..adfe139728ecf3 100644 --- a/test/expect/TestScript.test_loop_unroll_unused_counter.expect +++ b/test/expect/TestScript.test_loop_unroll_unused_counter.expect @@ -1,31 +1,29 @@ graph(%x : Dynamic) { - %1 : int = prim::Constant[value=1]() + %1 : bool = prim::Constant[value=1]() %y.1 : int = prim::Constant[value=0]() - %3 : int = prim::TensorToNum(%x) - %4 : bool = prim::Constant[value=1]() + %3 : int = prim::Constant[value=1]() + %4 : int = prim::TensorToNum(%x) %5 : int = prim::Constant[value=8]() - %6 : int = aten::floordiv(%3, %5) + %6 : int = aten::floordiv(%4, %5) %7 : int = prim::Constant[value=8]() %8 : int = aten::mul(%6, %7) - %9 : int = aten::sub(%3, %8) - %y.3 : int = prim::Loop(%6, %4, %y.1) + %9 : int = aten::sub(%4, %8) + %y.3 : int = prim::Loop(%6, %1, %y.1) block0(%i.1 : int, %12 : int) { - %y.12 : int = aten::add(%12, %1) - %y.5 : int = aten::add(%y.12, %1) - %y.6 : int = aten::add(%y.5, %1) - %y.7 : int = aten::add(%y.6, %1) - %y.8 : int = aten::add(%y.7, %1) - %y.9 : int = aten::add(%y.8, %1) - %y.10 : int = aten::add(%y.9, %1) - %y.11 : int = aten::add(%y.10, %1) - %21 : bool = prim::Constant[value=1]() - -> (%21, %y.11) + %y.12 : int = aten::add(%12, %3) + %y.5 : int = aten::add(%y.12, %3) + %y.6 : int = aten::add(%y.5, %3) + %y.7 : int = aten::add(%y.6, %3) + %y.8 : int = aten::add(%y.7, %3) + %y.9 : int = aten::add(%y.8, %3) + %y.10 : int = aten::add(%y.9, %3) + %y.11 : int = aten::add(%y.10, %3) + -> (%1, %y.11) } - %y : int = prim::Loop(%9, %4, %y.3) - block0(%i : int, %24 : int) { - %y.4 : int = aten::add(%24, %1) - %26 : bool = prim::Constant[value=1]() - -> (%26, %y.4) + %y : int = prim::Loop(%9, %1, %y.3) + block0(%i : int, %23 : int) { + %y.4 : int = aten::add(%23, %3) + -> (%1, %y.4) } return (%y); } diff --git a/test/expect/TestScript.test_loop_unrolling.expect b/test/expect/TestScript.test_loop_unrolling.expect index 5f3e61a4ce34b6..3e913eee8b8638 100644 --- a/test/expect/TestScript.test_loop_unrolling.expect +++ b/test/expect/TestScript.test_loop_unrolling.expect @@ -1,14 +1,14 @@ graph(%x : Dynamic) { + %1 : bool = prim::Constant[value=1]() %y.1 : int = prim::Constant[value=0]() - %2 : int = prim::TensorToNum(%x) - %3 : bool = prim::Constant[value=1]() + %3 : int = prim::TensorToNum(%x) %4 : int = prim::Constant[value=0]() %5 : int = prim::Constant[value=8]() - %6 : int = aten::floordiv(%2, %5) + %6 : int = aten::floordiv(%3, %5) %7 : int = prim::Constant[value=8]() %8 : int = aten::mul(%6, %7) - %9 : int = aten::sub(%2, %8) - %10 : Dynamic, %y.3 : int = prim::Loop(%6, %3, %4, %y.1) + %9 : int = aten::sub(%3, %8) + %10 : Dynamic, %y.3 : int = prim::Loop(%6, %1, %4, %y.1) block0(%i.1 : int, %13 : int, %14 : int) { %y.12 : int 
= aten::add(%14, %13) %16 : int = prim::Constant[value=1]() @@ -32,18 +32,16 @@ graph(%x : Dynamic) { %34 : int = prim::Constant[value=1]() %35 : int = aten::add(%32, %34) %y.11 : int = aten::add(%y.10, %35) - %37 : bool = prim::Constant[value=1]() - %38 : int = prim::Constant[value=1]() - %39 : int = aten::add(%35, %38) - -> (%37, %39, %y.11) + %37 : int = prim::Constant[value=1]() + %38 : int = aten::add(%35, %37) + -> (%1, %38, %y.11) } - %40 : Dynamic, %y : int = prim::Loop(%9, %3, %10, %y.3) - block0(%i : int, %43 : int, %44 : int) { - %y.4 : int = aten::add(%44, %43) - %46 : bool = prim::Constant[value=1]() - %47 : int = prim::Constant[value=1]() - %48 : int = aten::add(%43, %47) - -> (%46, %48, %y.4) + %39 : Dynamic, %y : int = prim::Loop(%9, %1, %10, %y.3) + block0(%i : int, %42 : int, %43 : int) { + %y.4 : int = aten::add(%43, %42) + %45 : int = prim::Constant[value=1]() + %46 : int = aten::add(%42, %45) + -> (%1, %46, %y.4) } return (%y); } diff --git a/test/expect/TestScript.test_loop_unrolling_const-add_const.expect b/test/expect/TestScript.test_loop_unrolling_const-add_const.expect index a08f3c99bdcf8c..7c151a653b7737 100644 --- a/test/expect/TestScript.test_loop_unrolling_const-add_const.expect +++ b/test/expect/TestScript.test_loop_unrolling_const-add_const.expect @@ -1,15 +1,15 @@ graph() { - %0 : int = prim::Constant[value=1]() %y.1 : int = prim::Constant[value=0]() - %y.11 : int = aten::add(%y.1, %0) - %y.2 : int = aten::add(%y.11, %0) - %y.3 : int = aten::add(%y.2, %0) - %y.4 : int = aten::add(%y.3, %0) - %y.5 : int = aten::add(%y.4, %0) - %y.6 : int = aten::add(%y.5, %0) - %y.7 : int = aten::add(%y.6, %0) - %y.8 : int = aten::add(%y.7, %0) - %y.9 : int = aten::add(%y.8, %0) - %y.10 : int = aten::add(%y.9, %0) + %1 : int = prim::Constant[value=1]() + %y.11 : int = aten::add(%y.1, %1) + %y.2 : int = aten::add(%y.11, %1) + %y.3 : int = aten::add(%y.2, %1) + %y.4 : int = aten::add(%y.3, %1) + %y.5 : int = aten::add(%y.4, %1) + %y.6 : int = aten::add(%y.5, %1) + %y.7 : int = aten::add(%y.6, %1) + %y.8 : int = aten::add(%y.7, %1) + %y.9 : int = aten::add(%y.8, %1) + %y.10 : int = aten::add(%y.9, %1) return (%y.10); } diff --git a/test/expect/TestScript.test_loop_unrolling_nested.expect b/test/expect/TestScript.test_loop_unrolling_nested.expect index 2668b111abba9b..94c6e0b5609f82 100644 --- a/test/expect/TestScript.test_loop_unrolling_nested.expect +++ b/test/expect/TestScript.test_loop_unrolling_nested.expect @@ -1,56 +1,52 @@ graph(%x : Dynamic) { - %1 : int = prim::Constant[value=10]() + %1 : bool = prim::Constant[value=1]() %y.1 : int = prim::Constant[value=0]() - %3 : bool = prim::Constant[value=1]() - %y : int = prim::Loop(%1, %3, %y.1) + %3 : int = prim::Constant[value=10]() + %y : int = prim::Loop(%3, %1, %y.1) block0(%i : int, %6 : int) { %7 : int = prim::TensorToNum(%x) - %8 : bool = prim::Constant[value=1]() - %9 : int = prim::Constant[value=0]() - %10 : int = prim::Constant[value=8]() - %11 : int = aten::floordiv(%7, %10) - %12 : int = prim::Constant[value=8]() - %13 : int = aten::mul(%11, %12) - %14 : int = aten::sub(%7, %13) - %15 : Dynamic, %y.4 : int = prim::Loop(%11, %8, %9, %6) - block0(%j.1 : int, %18 : int, %19 : int) { - %y.13 : int = aten::add(%19, %18) - %21 : int = prim::Constant[value=1]() - %22 : int = aten::add(%18, %21) - %y.6 : int = aten::add(%y.13, %22) - %24 : int = prim::Constant[value=1]() - %25 : int = aten::add(%22, %24) - %y.7 : int = aten::add(%y.6, %25) - %27 : int = prim::Constant[value=1]() - %28 : int = aten::add(%25, %27) - %y.8 : 
int = aten::add(%y.7, %28) - %30 : int = prim::Constant[value=1]() - %31 : int = aten::add(%28, %30) - %y.9 : int = aten::add(%y.8, %31) - %33 : int = prim::Constant[value=1]() - %34 : int = aten::add(%31, %33) - %y.10 : int = aten::add(%y.9, %34) - %36 : int = prim::Constant[value=1]() - %37 : int = aten::add(%34, %36) - %y.11 : int = aten::add(%y.10, %37) - %39 : int = prim::Constant[value=1]() - %40 : int = aten::add(%37, %39) - %y.12 : int = aten::add(%y.11, %40) - %42 : bool = prim::Constant[value=1]() - %43 : int = prim::Constant[value=1]() - %44 : int = aten::add(%40, %43) - -> (%42, %44, %y.12) + %8 : int = prim::Constant[value=0]() + %9 : int = prim::Constant[value=8]() + %10 : int = aten::floordiv(%7, %9) + %11 : int = prim::Constant[value=8]() + %12 : int = aten::mul(%10, %11) + %13 : int = aten::sub(%7, %12) + %14 : Dynamic, %y.4 : int = prim::Loop(%10, %1, %8, %6) + block0(%j.1 : int, %17 : int, %18 : int) { + %y.13 : int = aten::add(%18, %17) + %20 : int = prim::Constant[value=1]() + %21 : int = aten::add(%17, %20) + %y.6 : int = aten::add(%y.13, %21) + %23 : int = prim::Constant[value=1]() + %24 : int = aten::add(%21, %23) + %y.7 : int = aten::add(%y.6, %24) + %26 : int = prim::Constant[value=1]() + %27 : int = aten::add(%24, %26) + %y.8 : int = aten::add(%y.7, %27) + %29 : int = prim::Constant[value=1]() + %30 : int = aten::add(%27, %29) + %y.9 : int = aten::add(%y.8, %30) + %32 : int = prim::Constant[value=1]() + %33 : int = aten::add(%30, %32) + %y.10 : int = aten::add(%y.9, %33) + %35 : int = prim::Constant[value=1]() + %36 : int = aten::add(%33, %35) + %y.11 : int = aten::add(%y.10, %36) + %38 : int = prim::Constant[value=1]() + %39 : int = aten::add(%36, %38) + %y.12 : int = aten::add(%y.11, %39) + %41 : int = prim::Constant[value=1]() + %42 : int = aten::add(%39, %41) + -> (%1, %42, %y.12) } - %45 : Dynamic, %y.3 : int = prim::Loop(%14, %8, %15, %y.4) - block0(%j : int, %48 : int, %49 : int) { - %y.5 : int = aten::add(%49, %48) - %51 : bool = prim::Constant[value=1]() - %52 : int = prim::Constant[value=1]() - %53 : int = aten::add(%48, %52) - -> (%51, %53, %y.5) + %43 : Dynamic, %y.3 : int = prim::Loop(%13, %1, %14, %y.4) + block0(%j : int, %46 : int, %47 : int) { + %y.5 : int = aten::add(%47, %46) + %49 : int = prim::Constant[value=1]() + %50 : int = aten::add(%46, %49) + -> (%1, %50, %y.5) } - %54 : bool = prim::Constant[value=1]() - -> (%54, %y.3) + -> (%1, %y.3) } return (%y); } diff --git a/test/expect/TestScript.test_math_numbers-float.expect b/test/expect/TestScript.test_math_numbers-float.expect index acc1f1ce1fecd2..cf5e4756a39de1 100644 --- a/test/expect/TestScript.test_math_numbers-float.expect +++ b/test/expect/TestScript.test_math_numbers-float.expect @@ -1,6 +1,6 @@ graph(%x : Dynamic) { - %1 : float = prim::Constant[value=3.1]() - %2 : float = prim::Constant[value=1.1]() - %3 : float = aten::add(%2, %1) + %1 : float = prim::Constant[value=1.1]() + %2 : float = prim::Constant[value=3.1]() + %3 : float = aten::add(%1, %2) return (%3); } diff --git a/test/expect/TestScript.test_math_numbers-int.expect b/test/expect/TestScript.test_math_numbers-int.expect index fb5bb2096ceb10..64e4f319dbc166 100644 --- a/test/expect/TestScript.test_math_numbers-int.expect +++ b/test/expect/TestScript.test_math_numbers-int.expect @@ -1,6 +1,6 @@ graph(%x : Dynamic) { - %1 : int = prim::Constant[value=8]() - %2 : int = prim::Constant[value=7]() - %3 : int = aten::add(%2, %1) + %1 : int = prim::Constant[value=7]() + %2 : int = prim::Constant[value=8]() + %3 : int = 
aten::add(%1, %2) return (%3); } diff --git a/test/expect/TestScript.test_math_tensor_number.expect b/test/expect/TestScript.test_math_tensor_number.expect index fb4b81bd00cba5..7ce088d9534490 100644 --- a/test/expect/TestScript.test_math_tensor_number.expect +++ b/test/expect/TestScript.test_math_tensor_number.expect @@ -1,6 +1,6 @@ graph(%x : Dynamic) { - %1 : int = prim::Constant[value=7]() - %2 : int = prim::Constant[value=1]() - %3 : Dynamic = aten::add(%x, %1, %2) + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=7]() + %3 : Dynamic = aten::add(%x, %2, %1) return (%3); } diff --git a/test/expect/TestScript.test_onnx_export_script_module_loop.expect b/test/expect/TestScript.test_onnx_export_script_module_loop.expect index c3b982d1f6dfe5..3d967458306825 100644 --- a/test/expect/TestScript.test_onnx_export_script_module_loop.expect +++ b/test/expect/TestScript.test_onnx_export_script_module_loop.expect @@ -11,15 +11,14 @@ ModelProto { nodes: [ Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Loop", inputs: [1,2,x.1], outputs: [3], attributes: [{ name: 'body', type: graph, value: + Node {type: "Loop", inputs: [2,1,x.1], outputs: [3], attributes: [{ name: 'body', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [{name: "_", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "6", type:Tensor dims: }] - outputs: [{name: "8", type:Tensor dims: },{name: "7", type:Tensor dims: }] + outputs: [{name: "1", type:Tensor dims: },{name: "7", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Add", inputs: [6,6], outputs: [7], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [8], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]} + Node {type: "Add", inputs: [6,6], outputs: [7], attributes: []} ] } diff --git a/test/expect/TestScript.test_onnx_export_script_truediv.expect b/test/expect/TestScript.test_onnx_export_script_truediv.expect index 35d45d5b62fdd1..02b6b338ffc050 100644 --- a/test/expect/TestScript.test_onnx_export_script_truediv.expect +++ b/test/expect/TestScript.test_onnx_export_script_truediv.expect @@ -12,9 +12,9 @@ ModelProto { Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Shape", inputs: [x], outputs: [3], attributes: []}, - Node {type: "Gather", inputs: [3,2], outputs: [4], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Gather", inputs: [3,1], outputs: [4], attributes: [{ name: 'axis', type: int, value: 0}]}, Node {type: "Cast", inputs: [4], outputs: [5], attributes: [{ name: 'to', type: int, value: 1}]}, - Node {type: "Cast", inputs: [1], outputs: [6], attributes: [{ name: 'to', type: int, value: 1}]}, + Node {type: "Cast", inputs: [2], outputs: [6], attributes: [{ name: 'to', type: int, value: 1}]}, Node {type: "Div", inputs: [5,6], outputs: [7], attributes: []}, Node {type: "Add", inputs: [x,7], outputs: [8], attributes: []} ] diff --git a/test/expect/TestScript.test_onnx_raw_export_script_truediv.expect b/test/expect/TestScript.test_onnx_raw_export_script_truediv.expect index 2eb94a744dd047..4a0f29668870b2 100644 --- 
a/test/expect/TestScript.test_onnx_raw_export_script_truediv.expect +++ b/test/expect/TestScript.test_onnx_raw_export_script_truediv.expect @@ -11,14 +11,14 @@ ModelProto { nodes: [ Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "size", inputs: [x,2], outputs: [3], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "_cast_Float", inputs: [3,4], outputs: [5], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "_cast_Float", inputs: [1,6], outputs: [7], attributes: []}, - Node {type: "div", inputs: [5,7], outputs: [z], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [9], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "add", inputs: [x,z,9], outputs: [10], attributes: []} + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "size", inputs: [x,2], outputs: [4], attributes: []}, + Node {type: "Constant", inputs: [], outputs: [5], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "_cast_Float", inputs: [4,5], outputs: [6], attributes: []}, + Node {type: "Constant", inputs: [], outputs: [7], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "_cast_Float", inputs: [3,7], outputs: [8], attributes: []}, + Node {type: "div", inputs: [6,8], outputs: [z], attributes: []}, + Node {type: "add", inputs: [x,z,1], outputs: [10], attributes: []} ] } opset_import: [OperatorSetIdProto { domain: }], diff --git a/test/expect/TestScript.test_string_cu.expect b/test/expect/TestScript.test_string_cu.expect index cefcd07bbba130..d4d96182a30316 100644 --- a/test/expect/TestScript.test_string_cu.expect +++ b/test/expect/TestScript.test_string_cu.expect @@ -1,7 +1,7 @@ graph(%a : Dynamic) { - %2 : int = prim::Constant[value=2]() - %1 : string = prim::Constant[value="a\n\tb\n"]() %3 : string = prim::Constant[value="aa"]() + %1 : string = prim::Constant[value="a\n\tb\n"]() + %2 : int = prim::Constant[value=2]() = prim::Print(%a, %1, %2, %3) return (%a); } diff --git a/test/expect/TestScript.test_sum-1.expect b/test/expect/TestScript.test_sum-1.expect index ce0d1fe0b5d65c..da9ca629e3e109 100644 --- a/test/expect/TestScript.test_sum-1.expect +++ b/test/expect/TestScript.test_sum-1.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic) { - %1 : int = prim::Constant[value=4]() - %2 : int[] = prim::ListConstruct(%1) - %3 : bool = prim::Constant[value=0]() - %4 : Dynamic = aten::sum(%x, %2, %3) + %1 : bool = prim::Constant[value=0]() + %2 : int = prim::Constant[value=4]() + %3 : int[] = prim::ListConstruct(%2) + %4 : Dynamic = aten::sum(%x, %3, %1) return (%4); } diff --git a/test/expect/TestScript.test_sum-2.expect b/test/expect/TestScript.test_sum-2.expect index a952901438c94f..9a321a23f96080 100644 --- a/test/expect/TestScript.test_sum-2.expect +++ b/test/expect/TestScript.test_sum-2.expect @@ -1,7 +1,7 @@ graph(%x : Double(*, *, *, *, *)) { - %1 : int = prim::Constant[value=4]() - %2 : int[] = prim::ListConstruct(%1) - %3 : bool = prim::Constant[value=0]() - %4 : Dynamic = 
aten::sum(%x, %2, %3) + %1 : bool = prim::Constant[value=0]() + %2 : int = prim::Constant[value=4]() + %3 : int[] = prim::ListConstruct(%2) + %4 : Dynamic = aten::sum(%x, %3, %1) return (%4); } diff --git a/test/expect/TestScript.test_tensor_scalar_fusion_cuda-2.expect b/test/expect/TestScript.test_tensor_scalar_fusion_cuda-2.expect index 564b606ecfe5c6..34fdaa019891e1 100644 --- a/test/expect/TestScript.test_tensor_scalar_fusion_cuda-2.expect +++ b/test/expect/TestScript.test_tensor_scalar_fusion_cuda-2.expect @@ -1,8 +1,8 @@ graph(%x : Float(*, *) %z : Float()) { - %2 : int = prim::TensorToNum(%z) - %3 : int = prim::Constant[value=1]() - %y : Float(*, *) = aten::add(%x, %2, %3) + %2 : int = prim::Constant[value=1]() + %3 : int = prim::TensorToNum(%z) + %y : Float(*, *) = aten::add(%x, %3, %2) %5 : Float(*, *) = aten::mul(%x, %y) return (%5); } diff --git a/test/expect/TestScript.test_type_cast-test_float_to_int.expect b/test/expect/TestScript.test_type_cast-test_float_to_int.expect index ffc45ac56552b9..2322f526e2f051 100644 --- a/test/expect/TestScript.test_type_cast-test_float_to_int.expect +++ b/test/expect/TestScript.test_type_cast-test_float_to_int.expect @@ -1,7 +1,7 @@ graph() { - %0 : int = prim::Constant[value=1]() - %1 : float = prim::Constant[value=5]() - %b : int = prim::FloatToInt(%1) - %3 : int = aten::add(%b, %0) + %0 : float = prim::Constant[value=5]() + %1 : int = prim::Constant[value=1]() + %b : int = prim::FloatToInt(%0) + %3 : int = aten::add(%b, %1) return (%3); } diff --git a/test/expect/TestScript.test_type_cast-test_int_to_float.expect b/test/expect/TestScript.test_type_cast-test_int_to_float.expect index 719cc1e881c594..2fef6a915cc68a 100644 --- a/test/expect/TestScript.test_type_cast-test_int_to_float.expect +++ b/test/expect/TestScript.test_type_cast-test_int_to_float.expect @@ -1,7 +1,7 @@ graph() { - %0 : float = prim::Constant[value=1]() - %1 : int = prim::Constant[value=2]() - %b : float = prim::IntToFloat(%1) - %3 : float = aten::add(%b, %0) + %0 : int = prim::Constant[value=2]() + %1 : float = prim::Constant[value=1]() + %b : float = prim::IntToFloat(%0) + %3 : float = aten::add(%b, %1) return (%3); } diff --git a/test/expect/TestScript.test_weak_script_function.expect b/test/expect/TestScript.test_weak_script_function.expect index e8e85bd98781fa..6b0502030e202b 100644 --- a/test/expect/TestScript.test_weak_script_function.expect +++ b/test/expect/TestScript.test_weak_script_function.expect @@ -1,51 +1,36 @@ graph(%x.1 : Dynamic) { - %x.3 : Dynamic = ^not_a_script_fn()(%x.1) - %2 : int = prim::Constant[value=4]() - %3 : int = prim::Constant[value=3]() - %4 : int = prim::Constant[value=2]() + %1 : int = prim::Constant[value=5]() + %2 : int = prim::Constant[value=6]() + %3 : int = prim::Constant[value=4]() + %4 : int = prim::Constant[value=3]() %5 : int = prim::Constant[value=2]() - %6 : Dynamic = aten::norm(%x.3, %5) - %7 : Dynamic = aten::gt(%6, %4) - %8 : bool = prim::TensorToBool(%7) - %x.4 : Dynamic = prim::If(%8) + %6 : int = prim::Constant[value=1]() + %x.3 : Dynamic = ^not_a_script_fn()(%x.1) + %8 : Dynamic = aten::norm(%x.3, %5) + %9 : Dynamic = aten::gt(%8, %5) + %10 : bool = prim::TensorToBool(%9) + %x.4 : Dynamic = prim::If(%10) block0() { - %10 : int = prim::Constant[value=1]() - %x.2 : Dynamic = aten::add(%x.3, %3, %10) + %x.2 : Dynamic = aten::add(%x.3, %4, %6) -> (%x.2) } block1() { -> (%x.3) } - %12 : int = prim::Constant[value=1]() - %13 : Dynamic = aten::add(%x.4, %2, %12) + %13 : Dynamic = aten::add(%x.4, %3, %6) %14 : Dynamic = 
^not_a_script_fn()(%x.4) - %15 : int = prim::Constant[value=1]() - %16 : Dynamic = aten::add(%14, %x.4, %15) - %17 : int = prim::Constant[value=1]() - %18 : int = prim::Constant[value=1]() - %19 : Dynamic = aten::add(%x.4, %17, %18) - %20 : int = prim::Constant[value=1]() - %21 : Dynamic = aten::add(%16, %19, %20) - %22 : int = prim::Constant[value=1]() - %x : Dynamic = aten::add(%13, %21, %22) - %24 : int = prim::Constant[value=5]() - %25 : int = prim::Constant[value=1]() - %26 : Dynamic = aten::add(%x, %24, %25) - %27 : int = prim::Constant[value=6]() - %28 : int = prim::Constant[value=1]() - %29 : Dynamic = aten::add(%x, %27, %28) - %30 : Dynamic = ^not_a_script_fn()(%x) - %31 : int = prim::Constant[value=1]() - %32 : Dynamic = aten::add(%29, %30, %31) - %33 : int = prim::Constant[value=1]() - %34 : Dynamic = aten::add(%26, %32, %33) - %35 : int = prim::Constant[value=6]() - %36 : int = prim::Constant[value=1]() - %37 : Dynamic = aten::add(%x, %35, %36) - %38 : Dynamic = ^not_a_script_fn()(%x) - %39 : int = prim::Constant[value=1]() - %40 : Dynamic = aten::add(%37, %38, %39) - %41 : int = prim::Constant[value=1]() - %42 : Dynamic = aten::add(%34, %40, %41) - return (%42); + %15 : Dynamic = aten::add(%14, %x.4, %6) + %16 : Dynamic = aten::add(%x.4, %6, %6) + %17 : Dynamic = aten::add(%15, %16, %6) + %x : Dynamic = aten::add(%13, %17, %6) + %19 : Dynamic = aten::add(%x, %1, %6) + %20 : Dynamic = aten::add(%x, %2, %6) + %21 : Dynamic = ^not_a_script_fn()(%x) + %22 : Dynamic = aten::add(%20, %21, %6) + %23 : Dynamic = aten::add(%19, %22, %6) + %24 : Dynamic = aten::add(%x, %2, %6) + %25 : Dynamic = ^not_a_script_fn()(%x) + %26 : Dynamic = aten::add(%24, %25, %6) + %27 : Dynamic = aten::add(%23, %26, %6) + return (%27); } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 923c0d4c874fc0..65c6180bbaf7a8 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -927,6 +927,22 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + def test_where_functional(self): + class WhereFunctional(torch.nn.Module): + def forward(self, x): + return torch.where(x > 2.0, x, torch.neg(x)) + + x = torch.randn(3, 4) + self.run_model_test(WhereFunctional(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + + def test_where_method(self): + class WhereMethod(torch.nn.Module): + def forward(self, x): + return x.where(x > 2.0, torch.neg(x)) + + x = torch.randn(3, 4) + self.run_model_test(WhereMethod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + # a bit of metaprogramming to set up all the rnn tests diff --git a/test/test_jit.py b/test/test_jit.py index 4aed6b86dd0697..439727f3f7b0d2 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3075,6 +3075,33 @@ def func(x): y2 = torch.sum(x, dim=0) self.assertEqual(y, y2) + def test_constant_pooling(self): + def func(cond): + a = 1 + b = 4 + c = 0 + d = "abc" + e = "bcd" + f = "abc" + x = torch.ones([2]) + y = x * 4 + z = torch.ones([2]) + if bool(cond): + c = b - a + else: + y = torch.rand(0) + if bool(cond): + y = torch.rand(1) + print(d, e, f, x, y, z) + b = b - a + return a, b, c, x + + self.checkScript(func, torch.tensor([1])) + graph = torch.jit.script(func).graph + self.run_pass('constant_propagation', graph) + self.run_pass('constant_pooling', graph) + self.assertExpectedGraph(graph) + def test_literal(self): def func1(a, b): c = a, b @@ -5393,6 
+5420,26 @@ def foo(self): self.assertEqual(m_orig.foo(), m_import.foo()) self.assertTrue(m_orig.foo().dtype == m_import.foo().dtype) + def test_script_module_export_blocks(self): + class M(torch.jit.ScriptModule): + def __init__(self, n, m): + super(M, self).__init__() + self.weight = torch.nn.Parameter(torch.rand(n, m)) + + @torch.jit.script_method + def forward(self, input): + if bool(input.sum() > 0): + output = self.weight.mv(input) + else: + output = self.weight + input + return output + + m_orig = M(200, 200) + m_import = self.getExportImportCopy(m_orig) + + t = torch.rand(200) + self.assertEqual(m_orig(t), m_import(t)) + def test_script_module_export_shared_storage(self): class M(torch.jit.ScriptModule): diff --git a/test/test_nn.py b/test/test_nn.py index 6768e21140d833..f25650cad1d0e3 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -420,7 +420,7 @@ def apply_fn(input1, input2, *params): def test_cuda(self, test_case, dtype=None, extra_args=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): - return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) + return obj.detach().to(dtype=dtype).requires_grad_(requires_grad) elif isinstance(obj, torch.Tensor): return obj.to(dtype) elif isinstance(obj, tuple): @@ -766,14 +766,14 @@ def _test_dropout(self, cls, cuda, input): input = input.to(device).fill_(1 - p) module = cls(p) - input_var = torch.tensor(input, requires_grad=True) + input_var = input.clone().requires_grad_() output = module(input_var) self.assertLess(abs(output.data.mean() - (1 - p)), 0.05) output.backward(input) self.assertLess(abs(input_var.grad.data.mean() - (1 - p)), 0.05) module = cls(p, True) - input_var = torch.tensor(input.clone(), requires_grad=True) + input_var = input.clone().requires_grad_() output = module(input_var + 0) self.assertLess(abs(output.data.mean() - (1 - p)), 0.05) output.backward(input) @@ -794,7 +794,7 @@ def _test_alpha_dropout(self, cls, input): for p in [0.2, 0.5, 0.8]: module = cls(p) - input_var = torch.tensor(input, requires_grad=True) + input_var = input.detach().clone().requires_grad_() output = module(input_var) # output mean should be close to input mean self.assertLess(abs(output.data.mean() - mean), 0.1) @@ -1914,7 +1914,7 @@ def _test_gumbel_softmax_st(self, cuda, dtype=torch.float): exceed_limits = 0 for draw in range(num_draws): - logits_var = torch.tensor(logits, requires_grad=True) + logits_var = logits.detach().requires_grad_() y_draw = torch.nn.functional.gumbel_softmax( logits_var, hard=True) @@ -2222,7 +2222,7 @@ def test_FeatureAlphaDropout(self): def _test_InstanceNorm_general(self, cls, input, device="cpu", dtype=torch.float): # default case track_running_stats=False b, c = input.size(0), input.size(1) - input_var = torch.tensor(input, device=device, dtype=dtype, requires_grad=True) + input_var = input.to(device=device, dtype=dtype).requires_grad_() IN = cls(c, eps=0).to(device, dtype) @@ -2538,7 +2538,7 @@ def expected_output(dim): module = module_cls(2, return_indices=True).to(device, dtype=dtype) numel = 4 ** (num_dim + 1) input = torch.arange(1, numel + 1).view(2, 2, *repeat(4, num_dim)).to(device, dtype=dtype) - input_var = torch.tensor(input, requires_grad=True) + input_var = input.clone().detach().requires_grad_() # Check forward output, indices = module(input_var) @@ -2680,7 +2680,7 @@ def test_max_pool_nan(self, dtype=torch.float): self._test_max_pool_nan(self, device="cpu") def _test_scatter(self, tensor): - x = torch.tensor(tensor, requires_grad=True) + 
x = tensor.detach().requires_grad_() result = dp.scatter(x, (0, 1)) self.assertEqual(len(result), 2) self.assertEqual(result[0], x[:2]) @@ -2766,25 +2766,22 @@ def test_gather_different_len_dicts(self): with self.assertRaises(ValueError): _ = dp.gather(inputs, target_device=0) - def _test_broadcast_double_backwards(self, *tensors): - variables = tuple(torch.tensor(t, requires_grad=True) for t in tensors) - _assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), variables) - @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @skipIfRocm def test_broadcast_double_backwards_gpu(self): - self._test_broadcast_double_backwards(torch.randn(4, 4).cuda(), - torch.randn(4, 4).cuda(), - torch.randn(4, 4).cuda()) + tensors = (torch.randn(4, 4, device='cuda', requires_grad=True), + torch.randn(4, 4, device='cuda', requires_grad=True), + torch.randn(4, 4, device='cuda', requires_grad=True)) + _assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), tensors) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_broadcast_not_requiring_grad(self): variables = [ - Variable(torch.randn(1, 2).cuda(), requires_grad=True), - Variable(torch.randn(1, 2).cuda(), requires_grad=False), - Variable(torch.randn(1, 2).cuda(), requires_grad=False), - Variable(torch.randn(1, 2).cuda(), requires_grad=True), - Variable(torch.randn(1, 2).cuda(), requires_grad=True), + torch.randn(1, 2, device='cuda', requires_grad=True), + torch.randn(1, 2, device='cuda', requires_grad=False), + torch.randn(1, 2, device='cuda', requires_grad=False), + torch.randn(1, 2, device='cuda', requires_grad=True), + torch.randn(1, 2, device='cuda', requires_grad=True), ] broadcasted_variables = Broadcast.apply((0, 1), *variables) for output_idx, broadcasted_var in enumerate(broadcasted_variables): @@ -3471,7 +3468,7 @@ def run_test(benchmark): conv = torch.nn.Conv2d(256, 256, kernel_size=3, padding=1).to("cuda", dtype) for size in sizes: x = torch.randn(size, device="cuda", dtype=dtype) - out = conv(torch.tensor(x, requires_grad=True)) + out = conv(x.detach().clone().requires_grad_()) out.backward(torch.ones_like(out)) run_test(benchmark=False) @@ -3603,7 +3600,7 @@ def test_Conv2d_groups_nobias(self): def test_Conv2d_depthwise_naive_groups_cuda(self, dtype=torch.float): for depth_multiplier in [1, 2]: m = nn.Conv2d(2, 2 * depth_multiplier, kernel_size=3, groups=2).to("cuda", dtype) - i = torch.tensor(torch.randn(2, 2, 6, 6, device="cuda", dtype=dtype) / 2, requires_grad=True) + i = torch.randn(2, 2, 6, 6, device="cuda", dtype=dtype).div_(2).requires_grad_() output = m(i) grad_output = torch.randn(2, 2 * depth_multiplier, 4, 4, device="cuda", dtype=dtype) / 2 output.backward(grad_output) @@ -3613,14 +3610,14 @@ def test_Conv2d_depthwise_naive_groups_cuda(self, dtype=torch.float): m1 = nn.Conv2d(1, 1 * depth_multiplier, kernel_size=3).to("cuda", dtype) m1.weight.data = m.weight.data[:offset].clone() m1.bias.data = m.bias.data[:offset].clone() - i1 = torch.tensor(i.data[:, :1].contiguous(), requires_grad=True) + i1 = i.detach()[:, :1].clone().requires_grad_() output1 = m1(i1) output1.backward(grad_output[:, :offset].contiguous()) m2 = nn.Conv2d(1, 1 * depth_multiplier, kernel_size=3).to("cuda", dtype) m2.weight.data.copy_(m.weight.data[offset:]) m2.bias.data.copy_(m.bias.data[offset:]) - i2 = torch.tensor(i.data[:, 1:].contiguous(), requires_grad=True) + i2 = i.detach()[:, 1:].clone().requires_grad_() output2 = m2(i2) output2.backward(grad_output[:, offset:].contiguous()) @@ -3863,8 
+3860,7 @@ def pad(tensor, length): batch_sizes = [sum(map(bool, filter(lambda x: x >= i, lengths))) for i in range(1, max_length + 1)] offset = 0 padded = torch.cat([pad(i * 100 + torch.arange(1., 5 * l + 1).view(l, 1, 5), max_length) - for i, l in enumerate(lengths, 1)], 1) - padded = torch.tensor(padded, requires_grad=True) + for i, l in enumerate(lengths, 1)], 1).requires_grad_() expected_data = [[torch.arange(1., 6) + (i + 1) * 100 + 5 * n for i in range(batch_size)] for n, batch_size in enumerate(batch_sizes)] expected_data = list(itertools.chain.from_iterable(expected_data)) @@ -5003,8 +4999,8 @@ def test_cosine_embedding_loss_margin_no_reduce(self): margin=0.5, reduction='none')) def test_margin_ranking_loss_no_reduce(self): - input1 = torch.tensor(torch.randn(15).mul(10), requires_grad=True) - input2 = torch.tensor(torch.randn(15).mul(10), requires_grad=True) + input1 = torch.randn(15).mul_(10).requires_grad_() + input2 = torch.randn(15).mul_(10).requires_grad_() target = torch.randn(15).sign() self.assertTrue(gradcheck(lambda x, y, z: F.margin_ranking_loss( x, y, z, reduction='none'), (input1, input2, target))) @@ -5012,8 +5008,8 @@ def test_margin_ranking_loss_no_reduce(self): loss_reference_fns['MarginRankingLoss'](input1, input2, target, reduction='none')) def test_margin_ranking_loss_margin_no_reduce(self): - input1 = torch.tensor(torch.randn(15).mul(10), requires_grad=True) - input2 = torch.tensor(torch.randn(15).mul(10), requires_grad=True) + input1 = torch.randn(15).mul_(10).requires_grad_() + input2 = torch.randn(15).mul_(10).requires_grad_() target = torch.randn(15).sign() self.assertTrue(gradcheck(lambda x, y, z: F.margin_ranking_loss( x, y, z, margin=0.5, reduction='none'), (input1, input2, target))) diff --git a/third_party/nccl/.gitignore b/third_party/nccl/.gitignore deleted file mode 100644 index 34a07c2883e81e..00000000000000 --- a/third_party/nccl/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. -/build diff --git a/third_party/nccl/CMakeLists.txt b/third_party/nccl/CMakeLists.txt index 695a4d9f93c531..2ac095ff1da620 100644 --- a/third_party/nccl/CMakeLists.txt +++ b/third_party/nccl/CMakeLists.txt @@ -11,7 +11,7 @@ string(REPLACE "-gencode;" "-gencode=" NVCC_GENCODE "${NVCC_GENCODE}") message(STATUS "Set NVCC_GENCODE for building NCCL: ${NVCC_GENCODE}") ADD_CUSTOM_COMMAND( - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/nccl OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib/libnccl.so COMMAND env CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} NVCC=${CUDA_NVCC_EXECUTABLE} BUILDDIR=${CMAKE_CURRENT_BINARY_DIR} NVCC_GENCODE="${NVCC_GENCODE}" make -j${NUM_JOBS} ) diff --git a/third_party/nccl/LICENSE.txt b/third_party/nccl/LICENSE.txt deleted file mode 100644 index c7efd73e16e3df..00000000000000 --- a/third_party/nccl/LICENSE.txt +++ /dev/null @@ -1,31 +0,0 @@ - - Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
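Aside on the `test_nn.py` hunks above: they replace the copy-constructor pattern `torch.tensor(existing_tensor, requires_grad=True)`, which copies the data but warns when handed a tensor, with the explicit `detach()/clone()/requires_grad_()` chain; the hunks drop `clone()` where sharing storage with the source is harmless. A small sketch of the idiom, using a hypothetical `src` tensor:

```python
import torch

src = torch.randn(4)

# Build an independent autograd leaf from an existing tensor:
#   detach()          -> drop any autograd history
#   clone()           -> own storage, so in-place ops cannot alias src
#   requires_grad_()  -> mark the copy as a leaf that accumulates .grad
leaf = src.detach().clone().requires_grad_()

leaf.sum().backward()
assert leaf.grad is not None
assert src.grad is None  # src never entered the graph
```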
- * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National - Laboratory, the U.S. Department of Energy, nor the names of their - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - The U.S. Department of Energy funded the development of this software - under subcontract 7078610 with Lawrence Berkeley National Laboratory. - diff --git a/third_party/nccl/Makefile b/third_party/nccl/Makefile deleted file mode 100644 index 96b0719c6ac328..00000000000000 --- a/third_party/nccl/Makefile +++ /dev/null @@ -1,260 +0,0 @@ -# -# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. -# -# See LICENCE.txt for license information -# - -CUDA_HOME ?= /usr/local/cuda -PREFIX ?= /usr/local -VERBOSE ?= 0 -KEEP ?= 0 -DEBUG ?= 0 -PROFAPI ?= 0 -BUILDDIR ?= build -BUILDDIR := $(abspath $(BUILDDIR)) - -CUDA_LIB ?= $(CUDA_HOME)/lib64 -CUDA_INC ?= $(CUDA_HOME)/include -NVCC ?= $(CUDA_HOME)/bin/nvcc - -CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) -CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) -CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." 
-f 2) - -ifeq ($(CUDA_MAJOR), 7) -NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_52,code=sm_52 \ - -gencode=arch=compute_52,code=compute_52 -else ifeq ($(CUDA_MAJOR), 8) -NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_52,code=sm_52 \ - -gencode=arch=compute_60,code=sm_60\ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_60,code=compute_60 -else -NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_52,code=sm_52 \ - -gencode=arch=compute_60,code=sm_60\ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_60,code=compute_60 \ - -gencode=arch=compute_70,code=compute_70 -endif - -CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96 -Xfatbin -compress-all -# Use addprefix so that we can specify more than one path -LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt - -# If CUDA < 8.0, add workaround C++ flags -CUDA_MAJOR_LT_8 := $(shell [ $(CUDA_MAJOR) -lt 8 ] && echo true) -ifeq ($(CUDA_MAJOR_LT_8), true) -CXXFLAGS += -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__ -endif - -ifeq ($(DEBUG), 0) -NVCUFLAGS += -O3 -CXXFLAGS += -O3 -else -NVCUFLAGS += -O0 -G -CXXFLAGS += -O0 -g -ggdb3 -endif - -ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra -CXXFLAGS += -Wall -Wextra -else -.SILENT: -endif - -ifneq ($(KEEP), 0) -NVCUFLAGS += -keep -endif - -ifneq ($(PROFAPI), 0) -CXXFLAGS += -DPROFAPI -endif - -NCCL_MAJOR := 1 -NCCL_MINOR := 3 -NCCL_PATCH := 5 -CXXFLAGS += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH) - -CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) - -.PHONY : all lib staticlib clean test mpitest install deb debian debclean forlib fortest forclean -.DEFAULT : all - -INCEXPORTS := nccl.h -LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu -LIBNAME := libnccl.so -STATICLIBNAME := libnccl_static.a - -INCDIR := $(BUILDDIR)/include -LIBDIR := $(BUILDDIR)/lib -OBJDIR := $(BUILDDIR)/obj - -INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS)) -LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME)) -LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME)) -STATICLIBTARGET := $(STATICLIBNAME) -LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME)) -LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES))) -DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS)) - -all : lib staticlib - -lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) - -staticlib : $(INCTARGETS) $(LIBDIR)/$(STATICLIBTARGET) - --include $(DEPFILES) - -$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ) - @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ - mkdir -p $(LIBDIR) - $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ) - ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) - ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) - -$(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) - @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ - mkdir -p $(LIBDIR) - ar cr $@ $(LIBOBJ) - -$(INCDIR)/%.h : src/%.h - @printf "Grabbing %-35s > %s\n" $< $@ - mkdir -p $(INCDIR) - cp -f $< $@ - 
-$(OBJDIR)/%.o : src/%.cu - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p $(OBJDIR) - $(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@ - @$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp) - @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) - @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ - sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) - @rm -f $(@:%.o=%.d.tmp) - -clean : - rm -rf $(BUILDDIR) - -install : lib - mkdir -p $(PREFIX)/lib - mkdir -p $(PREFIX)/include - cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/ - cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ - - -#### TESTS #### - -TEST_ONLY ?= 0 - -# Tests depend on lib, except in TEST_ONLY mode. -ifeq ($(TEST_ONLY), 0) -TSTDEP = $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) -endif - -NCCL_LIB ?= $(LIBDIR) -NCCL_INC ?= $(INCDIR) - -MPI_HOME ?= /usr -MPI_INC ?= $(MPI_HOME)/include -MPI_LIB ?= $(MPI_HOME)/lib -MPIFLAGS := -I$(MPI_INC) -L$(MPI_LIB) -lmpi - -TESTS := all_gather_test all_gather_scan \ - all_reduce_test all_reduce_scan \ - broadcast_test broadcast_scan \ - reduce_test reduce_scan \ - reduce_scatter_test reduce_scatter_scan -MPITESTS := mpi_test - -TSTINC := -I$(NCCL_INC) -Itest/include -TSTLIB := -L$(NCCL_LIB) $(LIBLINK) $(LDFLAGS) -TSTDIR := $(BUILDDIR)/test/single -MPITSTDIR := $(BUILDDIR)/test/mpi -TESTBINS := $(patsubst %, $(TSTDIR)/%, $(TESTS)) -MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS)) - -test : $(TESTBINS) - -$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP) - @printf "Building %-35s > %s\n" $< $@ - mkdir -p $(TSTDIR) - $(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt - @$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp) - @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d) - @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \ - sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d) - @rm -f $(@:%=%.d.tmp) - -mpitest : $(MPITESTBINS) - -$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP) - @printf "Building %-35s > %s\n" $< $@ - mkdir -p $(MPITSTDIR) - $(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand - @$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp) - @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d) - @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \ - sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d) - @rm -f $(@:%=%.d.tmp) - -#### PACKAGING #### - -DEBIANDIR := $(BUILDDIR)/debian - -DEBGEN_IN := $(shell (cd debian ; ls *.in)) -DEBGEN := $(DEBGEN_IN:.in=) -DEBFILES := compat copyright libnccl-dev.install libnccl-dev.manpages nccl.7 rules $(DEBGEN) -DEBTARGETS := $(patsubst %, $(DEBIANDIR)/%, $(DEBFILES)) - -DEB_REVISION ?= 1 -DEB_TIMESTAMP := $(shell date -R) -DEB_ARCH ?= amd64 - -debian : $(DEBTARGETS) - -deb : lib debian - @printf "Building Debian package\n" - (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b) - mkdir -p $(BUILDDIR)/deb/ - mv $(BUILDDIR)/../libnccl*.deb $(BUILDDIR)/deb/ - -debclean : - rm -Rf $(DEBIANDIR) - -$(DEBIANDIR)/% : debian/%.in - @printf "Generating %-35s > %s\n" $< $@ - sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ - -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ - -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ - -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \ - -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \ - -e 
"s/\$${deb:Revision}/$(DEB_REVISION)/g" \ - -e "s/\$${deb:Timestamp}/$(DEB_TIMESTAMP)/g" \ - -e "s/\$${deb:Arch}/$(DEB_ARCH)/g" \ - $< > $@ - -$(DEBIANDIR)/% : debian/% - @printf "Grabbing %-35s > %s\n" $< $@ - mkdir -p $(DEBIANDIR) - cp -f $< $@ - -#### FORTRAN BINDINGS #### - -export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR - -forlib : lib - $(MAKE) -C fortran lib -fortest : forlib - $(MAKE) -C fortran test -forclean : - $(MAKE) -C fortran clean diff --git a/third_party/nccl/README.md b/third_party/nccl/README.md deleted file mode 100644 index 17b9546a2a86fe..00000000000000 --- a/third_party/nccl/README.md +++ /dev/null @@ -1,128 +0,0 @@ -**IMPORTANT NOTE** - -**NCCL1 is no longer maintained/updated and has been replaced by NCCL2, available at** - -**http://developer.nvidia.com/nccl.** - -# NCCL - -Optimized primitives for collective multi-GPU communication. - -## Introduction - -NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications. -[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance. - -## What's inside - -At present, the library implements the following collectives: -- all-reduce -- all-gather -- reduce-scatter -- reduce -- broadcast - -These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided. - -## Requirements - -NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported. - -Note: NCCL may also work with CUDA 6.5, but this is an untested configuration. - -## Build & run - -To build the library and tests. - -```shell -$ cd nccl -$ make CUDA_HOME= test -``` - -Test binaries are located in the subdirectories nccl/build/test/{single,mpi}. - -```shell -$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./build/lib -$ ./build/test/single/all_reduce_test -Error: must specify at least data size in bytes! - -Tests nccl AllReduce with user supplied arguments. - Usage: all_reduce_test [number of GPUs] [GPU 0] [GPU 1] ... 
- -$ ./build/test/single/all_reduce_test 10000000 -# Using devices -# Device 0 -> 0 [0x0a] GeForce GTX TITAN X -# Device 1 -> 1 [0x09] GeForce GTX TITAN X -# Device 2 -> 2 [0x06] GeForce GTX TITAN X -# Device 3 -> 3 [0x05] GeForce GTX TITAN X - -# out-of-place in-place -# bytes N type op time algbw busbw res time algbw busbw res - 10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00 - 10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00 - 10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00 - 10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00 - 10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00 - 10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00 - 10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00 - 10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00 - 10000000 5000000 half sum 1.617 6.18 9.28 4e-03 1.636 6.11 9.17 4e-03 - 10000000 5000000 half prod 1.618 6.18 9.27 1e-03 1.657 6.03 9.05 1e-03 - 10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00 - 10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00 - 10000000 2500000 float sum 1.618 6.18 9.27 5e-07 1.622 6.17 9.25 5e-07 - 10000000 2500000 float prod 1.614 6.20 9.29 1e-07 1.628 6.14 9.21 1e-07 - 10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00 - 10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00 - 10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00 - 10000000 1250000 double prod 1.619 6.18 9.26 2e-16 1.628 6.14 9.21 2e-16 - 10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00 - 10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00 -``` - -To install, run `make PREFIX= install` and add `/lib` to your `LD_LIBRARY_PATH`. - -## Usage - -NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single-process machine, all GPUs can be conveniently initialized using `ncclCommInitAll`. For multi-process applications (e.g., with MPI), `ncclCommInitRank` must be called for each GPU. Internally `ncclCommInitRank` invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single-process example follows, for an MPI example see test/mpi/mpi_test.cu. For details about the API see nccl.h. - -```c -#include - -typedef struct { - double* sendBuff; - double* recvBuff; - int size; - cudaStream_t stream; -} PerThreadData; - -int main(int argc, char* argv[]) -{ - int nGPUs; - cudaGetDeviceCount(&nGPUs); - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nGPUs); - ncclCommInitAll(comms, nGPUs); // initialize communicator - // One communicator per process - - PerThreadData* data; - - ... // Allocate data and issue work to each GPU's - // perDevStream to populate the sendBuffs. 
- - for(int i=0; i ${deb:Timestamp} diff --git a/third_party/nccl/debian/compat b/third_party/nccl/debian/compat deleted file mode 100644 index ec635144f60048..00000000000000 --- a/third_party/nccl/debian/compat +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/third_party/nccl/debian/control.in b/third_party/nccl/debian/control.in deleted file mode 100644 index e5ca48ebeff711..00000000000000 --- a/third_party/nccl/debian/control.in +++ /dev/null @@ -1,28 +0,0 @@ -Source: nccl -Section: libs -Maintainer: cudatools -Priority: optional -Build-depends: debhelper(>=9) -Standards-Version: 3.9.5 - -Package: libnccl${nccl:Major} -Section: libs -Architecture: ${deb:Arch} -Depends: ${misc:Depends}, ${shlibs:Depends} -Description: NVIDIA Collectives Communication Library (NCCL) Runtime - NCCL (pronounced "Nickel") is a stand-alone library of standard collective - communication routines for GPUs, such as all-gather, reduce, broadcast, etc., - that have been optimized to achieve high bandwidth over PCIe. NCCL supports up - to eight GPUs and can be used in either single- or multi-process (e.g., MPI) - applications. - -Package: libnccl-dev -Section: libdevel -Architecture: ${deb:Arch} -Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version}) -Description: NVIDIA Collectives Communication Library (NCCL) Development Files - NCCL (pronounced "Nickel") is a stand-alone library of standard collective - communication routines for GPUs, such as all-gather, reduce, broadcast, etc., - that have been optimized to achieve high bandwidth over PCIe. NCCL supports up - to eight GPUs and can be used in either single- or multi-process (e.g., MPI) - applications. diff --git a/third_party/nccl/debian/copyright b/third_party/nccl/debian/copyright deleted file mode 120000 index 4ab43736a839da..00000000000000 --- a/third_party/nccl/debian/copyright +++ /dev/null @@ -1 +0,0 @@ -../LICENSE.txt \ No newline at end of file diff --git a/third_party/nccl/debian/libnccl-dev.install b/third_party/nccl/debian/libnccl-dev.install deleted file mode 100644 index 90299a0beec477..00000000000000 --- a/third_party/nccl/debian/libnccl-dev.install +++ /dev/null @@ -1,2 +0,0 @@ -include/nccl.h usr/include -lib/libnccl.so /usr/lib/x86_64-linux-gnu diff --git a/third_party/nccl/debian/libnccl-dev.manpages b/third_party/nccl/debian/libnccl-dev.manpages deleted file mode 100644 index 4bfc2cb22651b6..00000000000000 --- a/third_party/nccl/debian/libnccl-dev.manpages +++ /dev/null @@ -1 +0,0 @@ -debian/nccl.7 diff --git a/third_party/nccl/debian/libnccl1.install.in b/third_party/nccl/debian/libnccl1.install.in deleted file mode 100644 index 73b4c0a9dd0337..00000000000000 --- a/third_party/nccl/debian/libnccl1.install.in +++ /dev/null @@ -1,2 +0,0 @@ -lib/libnccl.so.${nccl:Major} /usr/lib/x86_64-linux-gnu -lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/x86_64-linux-gnu diff --git a/third_party/nccl/debian/nccl.7 b/third_party/nccl/debian/nccl.7 deleted file mode 100644 index 0cb56017339d86..00000000000000 --- a/third_party/nccl/debian/nccl.7 +++ /dev/null @@ -1,139 +0,0 @@ -.TH NCCL -.SH NAME -.PP -nccl \- Optimized primitives for collective multi\-GPU communication. - -.SH Introduction -.PP -NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. 
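Aside: the deleted README above advises that "small collectives should be batched into larger operations whenever possible". One hand-rolled way to follow that advice from PyTorch, sketched under the assumption that the `torch.cuda.nccl` bindings and at least two CUDA devices are available:

```python
import torch
import torch.cuda.nccl as nccl

n = torch.cuda.device_count()  # assumes n >= 2
# 100 small per-device tensors...
smalls = [[torch.ones(16, device=f"cuda:{i}") for _ in range(100)] for i in range(n)]

# ...flattened into one 1600-element buffer per device, so NCCL runs a single
# large all-reduce instead of 100 tiny ones.
flat = [torch.cat(per_dev) for per_dev in smalls]
nccl.all_reduce(flat)  # in-place sum across devices, one buffer per GPU

# Scatter the reduced values back into the original small tensors.
for per_dev, buf in zip(smalls, flat):
    for j, t in enumerate(per_dev):
        t.copy_(buf[j * 16:(j + 1) * 16])
```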
NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications. - -.SH What's inside -.PP -At present, the library implements the following collectives: -\- all\-reduce -\- all\-gather -\- reduce\-scatter -\- reduce -\- broadcast - -.PP -These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided. - -.SH Requirements -.PP -NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported. - -.PP -Note: NCCL may also work with CUDA 6.5, but this is an untested configuration. - -.SH Build & run -.PP -To build the library and tests. - -.PP -.RS - -.nf -$ cd nccl -$ make CUDA\_HOME= test - -.fi -.RE - -.PP -Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest. - -.PP -.RS - -.nf -$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib -$ ./build/test/all\_reduce\_test -Error: must specify at least data size in bytes! - -Tests nccl AllReduce with user supplied arguments. - Usage: all\_reduce\_test [number of GPUs] [GPU 0] [GPU 1] ... - -$ ./build/test/all\_reduce\_test 10000000 -# Using devices -# Device 0 \-> 0 [0x0a] GeForce GTX TITAN X -# Device 1 \-> 1 [0x09] GeForce GTX TITAN X -# Device 2 \-> 2 [0x06] GeForce GTX TITAN X -# Device 3 \-> 3 [0x05] GeForce GTX TITAN X - -# out\-of\-place in\-place -# bytes N type op time algbw busbw res time algbw busbw res - 10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00 - 10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00 - 10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00 - 10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00 - 10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00 - 10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00 - 10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00 - 10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00 - 10000000 5000000 half sum 1.617 6.18 9.28 4e\-03 1.636 6.11 9.17 4e\-03 - 10000000 5000000 half prod 1.618 6.18 9.27 1e\-03 1.657 6.03 9.05 1e\-03 - 10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00 - 10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00 - 10000000 2500000 float sum 1.618 6.18 9.27 5e\-07 1.622 6.17 9.25 5e\-07 - 10000000 2500000 float prod 1.614 6.20 9.29 1e\-07 1.628 6.14 9.21 1e\-07 - 10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00 - 10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00 - 10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00 - 10000000 1250000 double prod 1.619 6.18 9.26 2e\-16 1.628 6.14 9.21 2e\-16 - 10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00 - 10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00 - -.fi -.RE - -.PP -To install, run \fB\fCmake PREFIX= install\fR and add \fB\fC/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR. - -.SH Usage -.PP -NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. 
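Aside: the single-process pattern described here (one communicator per GPU, then one collective call per device) is what PyTorch's thin `torch.cuda.nccl` wrapper automates: it creates and caches communicators for a list of per-device tensors, the analogue of `ncclCommInitAll`. A hedged sketch, assuming at least two CUDA devices:

```python
import torch
import torch.cuda.nccl as nccl

n = torch.cuda.device_count()  # assumes n >= 2
inputs = [torch.ones(1024, device=f"cuda:{i}") for i in range(n)]

# Communicators are created implicitly for this tensor list; the reduction
# runs in place, leaving the elementwise sum on every device.
nccl.all_reduce(inputs)

for t in inputs:
    assert t.eq(float(n)).all()
```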
On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally \fB\fCncclCommInitRank\fR invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single\-process example follows, for an MPI example see src/mpi\_test.cu. For details about the API see nccl.h. - -.PP -.RS - -.nf -#include - -typedef struct \{ - double* sendBuff; - double* recvBuff; - int size; - cudaStream\_t stream; -\} PerThreadData; - -int main(int argc, char* argv[]) -\{ - int nGPUs; - cudaGetDeviceCount(\&nGPUs); - ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs); - ncclCommInitAll(comms, nGPUs); // initialize communicator - // One communicator per process - - PerThreadData* data; - - ... // Allocate data and issue work to each GPU's - // perDevStream to populate the sendBuffs. - - for(int i=0; i %s\n" $(LIBTARGET) $@ - mkdir -p $(LIBDIR) - $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET) - ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) - ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) - -$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o - @printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@ - mkdir -p $(LIBDIR) - $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR) - -$(OBJDIR)/%.o: src/%.f90 - @printf "Building %-35s > %s\n" $< $@ - mkdir -p $(OBJDIR) - mkdir -p $(INCDIR) - $(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@ - -TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out -ifneq ($(filter pgf%, $(FCNAME)), ) -TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out -endif - -TESTDIR := $(BUILDDIR)/test/fortran -TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS)) - -test: lib $(TESTBINS) - -$(TESTDIR)/%: test/%.f90 lib - @printf "Building %-35s > %s\n" $< $@ - @mkdir -p $(TESTDIR) - $(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@ - -clean: - rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME) - rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod - rm -rf $(TESTDIR)/ - diff --git a/third_party/nccl/fortran/src/cudafor.f90 b/third_party/nccl/fortran/src/cudafor.f90 deleted file mode 100644 index 4ecd0f41b83e24..00000000000000 --- a/third_party/nccl/fortran/src/cudafor.f90 +++ /dev/null @@ -1,171 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. 
-!* -!* See LICENSE.txt for license information -!************************************************************************* - -#ifndef _CUDA - -!Start cudaFor module -module cudaFor -use iso_c_binding -implicit none -private -public :: c_devptr -public :: cudaMemcpyKind, & - cudaMemcpyHostToHost, & - cudaMemcpyHostToDevice, & - cudaMemcpyDeviceToHost, & - cudaMemcpyDeviceToDevice, & - cudaMemcpyDefault -public :: cuda_stream_kind -public :: cudaGetDeviceCount -public :: cudaSetDevice -public :: cudaMalloc -public :: cudaMemcpy -public :: cudaFree -public :: cudaStreamCreate -public :: cudaStreamSynchronize -public :: cudaStreamDestroy - -!Start types - -!Start c_devptr -type, bind(c) :: c_devptr -type(c_ptr) :: member -end type c_devptr -!End c_devptr - -!Start cudaMemcpyKind -type, bind(c) :: cudaMemcpyKind -integer(c_int) :: member -end type cudaMemcpyKind - -type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), & - cudaMemcpyHostToDevice = cudaMemcpyKind(1), & - cudaMemcpyDeviceToHost = cudaMemcpyKind(2), & - cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), & - cudaMemcpyDefault = cudaMemcpyKind(4) -!End cudaMemcpyKind - -!Start cuda_stream_kind -integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t -!End cuda_stream_kind - -!End types - -!Start interfaces - -!Start cudaGetDeviceCount -interface cudaGetDeviceCount -integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount") -import :: c_int -implicit none -integer(c_int) :: count -end function cudaGetDeviceCount -end interface cudaGetDeviceCount -!End cudaGetDeviceCount - -!Start cudaSetDevice -interface cudaSetDevice -integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice") -import :: c_int -implicit none -integer(c_int), value :: device -end function cudaSetDevice -end interface cudaSetDevice -!End cudaSetDevice - -!Start cudaMalloc -interface cudaMalloc -integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc") -import :: c_int, c_size_t -import :: c_devptr -implicit none -type(c_devptr) :: devPtr -integer(c_size_t), value :: size -end function cudaMalloc -end interface cudaMalloc -!End cudaMalloc - -!Start cudaMemcpy -interface cudaMemcpy - -!Start cudaMemcpyH2D -integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy") -import :: c_ptr, c_int, c_size_t -import :: c_devptr, cudaMemcpyKind -implicit none -type(c_devptr), value :: dst -type(c_ptr), value :: src -integer(c_size_t), value :: count -type(cudaMemcpyKind), value :: kind -end function cudaMemcpyH2D -!End cudaMemcpyH2D - -!Start cudaMemcpyD2H -integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy") -import :: c_ptr, c_int, c_size_t -import :: c_devptr, cudaMemcpyKind -implicit none -type(c_ptr), value :: dst -type(c_devptr), value :: src -integer(c_size_t), value :: count -type(cudaMemcpyKind), value :: kind -end function cudaMemcpyD2H -!End cudaMemcpyD2H - -end interface cudaMemcpy -!End cudaMemcpy - -!Start cudaFree -interface cudaFree -integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree") -import :: c_int -import :: c_devptr -implicit none -type(c_devptr), value :: devPtr -end function cudaFree -end interface cudaFree -!End cudaFree - -!Start cudaStreamCreate -interface cudaStreamCreate -integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate") -import :: c_int -import :: cuda_stream_kind -implicit none -integer(cuda_stream_kind) :: pStream -end function cudaStreamCreate 
-end interface cudaStreamCreate -!End cudaStreamCreate - -!Start cudaStreamSynchronize -interface cudaStreamSynchronize -integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize") -import :: c_int -import :: cuda_stream_kind -implicit none -integer(cuda_stream_kind), value :: stream -end function cudaStreamSynchronize -end interface cudaStreamSynchronize -!End cudaStreamSynchronize - -!Start cudaStreamDestroy -interface cudaStreamDestroy -integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy") -import :: c_int -import :: cuda_stream_kind -implicit none -integer(cuda_stream_kind), value :: stream -end function cudaStreamDestroy -end interface cudaStreamDestroy -!End cudaStreamDestroy - -!End interfaces - -end module cudaFor -!End cudaFor module - -#endif diff --git a/third_party/nccl/fortran/src/ncclfor.f90 b/third_party/nccl/fortran/src/ncclfor.f90 deleted file mode 100644 index 2ed4d3d8748d3b..00000000000000 --- a/third_party/nccl/fortran/src/ncclfor.f90 +++ /dev/null @@ -1,312 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -!Start defines -#define NCCL_UNIQUE_ID_BYTES 128 -!End defines - -!Start ncclFor module -module ncclFor -use iso_c_binding -use cudaFor -implicit none -private -public :: ncclUniqueId -public :: ncclComm -public :: ncclResult, & - ncclSuccess, & - ncclUnhandledCudaError, & - ncclSystemError, & - ncclInternalError, & - ncclInvalidDevicePointer, & - ncclInvalidRank, & - ncclUnsupportedDeviceCount, & - ncclDeviceNotFound, & - ncclInvalidDeviceIndex, & - ncclLibWrapperNotSet, & - ncclCudaMallocFailed, & - ncclRankMismatch, & - ncclInvalidArgument, & - ncclInvalidType, & - ncclInvalidOperation, & - nccl_NUM_RESULTS -public :: ncclDataType, & - ncclChar, & - ncclInt, & -#ifdef CUDA_HAS_HALF - ncclHalf, & -#endif - ncclFloat, & - ncclDouble, & - ncclInt64, & - ncclUInt64, & - nccl_NUM_TYPES -public :: ncclRedOp, & - ncclSum, & - ncclProd, & - ncclMax, & - ncclMin, & - nccl_NUM_OPS -public :: ncclGetUniqueId -public :: ncclCommInitRank -public :: ncclCommInitAll -public :: ncclCommCuDevice -public :: ncclCommUserRank -public :: ncclCommCount -public :: ncclCommDestroy -public :: ncclReduce -public :: ncclAllReduce -public :: ncclReduceScatter -public :: ncclBcast -public :: ncclAllGather - -!Start types - -!Start ncclUniqueId -type, bind(c) :: ncclUniqueId -character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES) -end type ncclUniqueId -!End ncclUniqueId - -!Start ncclComm -type, bind(c) :: ncclComm -type(c_ptr) :: member -end type ncclComm -!End ncclComm - -!Start ncclResult -type, bind(c) :: ncclResult -integer(c_int) :: member -end type ncclResult - -type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), & - ncclUnhandledCudaError = ncclResult( 1), & - ncclSystemError = ncclResult( 2), & - ncclInternalError = ncclResult( 3), & - ncclInvalidDevicePointer = ncclResult( 4), & - ncclInvalidRank = ncclResult( 5), & - ncclUnsupportedDeviceCount = ncclResult( 6), & - ncclDeviceNotFound = ncclResult( 7), & - ncclInvalidDeviceIndex = ncclResult( 8), & - ncclLibWrapperNotSet = ncclResult( 9), & - ncclCudaMallocFailed = ncclResult(10), & - ncclRankMismatch = ncclResult(11), & - ncclInvalidArgument = ncclResult(12), & - ncclInvalidType = ncclResult(13), 
& - ncclInvalidOperation = ncclResult(14), & - nccl_NUM_RESULTS = ncclResult(15) -!End ncclResult - -!Start ncclDataType -type, bind(c) :: ncclDataType -integer(c_int) :: member -end type ncclDataType - -type(ncclDataType), parameter :: ncclChar = ncclDataType(0), & - ncclInt = ncclDataType(1), & -#ifdef CUDA_HAS_HALF - ncclHalf = ncclDataType(2), & -#endif - ncclFloat = ncclDataType(3), & - ncclDouble = ncclDataType(4), & - ncclInt64 = ncclDataType(5), & - ncclUInt64 = ncclDataType(6), & - nccl_NUM_TYPES = ncclDataType(7) -!End ncclDataType - -!Start ncclRedOp -type, bind(c) :: ncclRedOp -integer(c_int) :: member -end type ncclRedOp - -type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), & - ncclProd = ncclRedOp(1), & - ncclMax = ncclRedOp(2), & - ncclMin = ncclRedOp(3), & - nccl_NUM_OPS = ncclRedOp(4) -!End ncclRedOp - -!End types - -!Start interfaces - -!Start ncclGetUniqueId -interface ncclGetUniqueId -type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId') -import :: ncclResult, ncclUniqueId -implicit none -type(ncclUniqueId) :: uniqueId -end function ncclGetUniqueId -end interface ncclGetUniqueId -!End ncclGetUniqueId - -!Start ncclCommInitRank -interface ncclCommInitRank -type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank') -import :: c_int -import :: ncclResult, ncclUniqueId, ncclComm -implicit none -type(ncclComm) :: comm(*) -integer(c_int), value :: ndev -type(ncclUniqueId), value :: commId -integer(c_int), value :: rank -end function ncclCommInitRank -end interface ncclCommInitRank -!End ncclCommInitRank - -!Start ncclCommInitAll -interface ncclCommInitAll -type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll') -import :: c_int -import :: ncclResult, ncclComm -implicit none -type(ncclComm) :: comm(*) -integer(c_int), value :: ndev -integer(c_int) :: devlist(*) -end function ncclCommInitAll -end interface ncclCommInitAll -!End ncclCommInitAll - -!Start ncclCommCuDevice -interface ncclCommCuDevice -type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice') -import :: c_int -import :: ncclResult, ncclComm -implicit none -type(ncclComm), value :: comm -integer(c_int) :: devid -end function ncclCommCuDevice -end interface ncclCommCuDevice -!End ncclCommCuDevice - -!Start ncclCommUserRank -interface ncclCommUserRank -type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank') -import :: c_int -import :: ncclResult, ncclComm -implicit none -type(ncclComm), value :: comm -integer(c_int) :: rank -end function ncclCommUserRank -end interface ncclCommUserRank -!End ncclCommUserRank - -!Start ncclCommCount -interface ncclCommCount -type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount') -import :: c_int -import :: ncclResult, ncclComm -implicit none -type(ncclComm), value :: comm -integer(c_int) :: count -end function ncclCommCount -end interface ncclCommCount -!End ncclCommCount - -!Start ncclCommDestroy -interface ncclCommDestroy -subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy') -import :: ncclComm -implicit none -type(ncclComm), value :: comm -end subroutine ncclCommDestroy -end interface ncclCommDestroy -!End ncclCommDestroy - -!Start ncclReduce -interface ncclReduce -type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce') -import :: c_int -import :: c_devptr, cuda_stream_kind -import :: 
ncclResult, ncclComm, ncclDataType, ncclRedOp -implicit none -type(c_devptr), value :: sendbuff -type(c_devptr), value :: recvbuff -integer(c_int), value :: count -type(ncclDataType), value :: datatype -type(ncclRedOp), value :: op -integer(c_int), value :: root -type(ncclComm), value :: comm -integer(cuda_stream_kind), value :: stream -end function ncclReduce -end interface ncclReduce -!End ncclReduce - -!Start ncclAllReduce -interface ncclAllReduce -type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce') -import :: c_int -import :: c_devptr, cuda_stream_kind -import :: ncclResult, ncclComm, ncclDataType, ncclRedOp -implicit none -type(c_devptr), value :: sendbuff -type(c_devptr), value :: recvbuff -integer(c_int), value :: count -type(ncclDataType), value :: datatype -type(ncclRedOp), value :: op -type(ncclComm), value :: comm -integer(cuda_stream_kind), value :: stream -end function ncclAllReduce -end interface ncclAllReduce -!End ncclAllReduce - -!Start ncclReduceScatter -interface ncclReduceScatter -type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter') -import :: c_int -import :: c_devptr, cuda_stream_kind -import :: ncclResult, ncclComm, ncclDataType, ncclRedOp -implicit none -type(c_devptr), value :: sendbuff -type(c_devptr), value :: recvbuff -integer(c_int), value :: recvcount -type(ncclDataType), value :: datatype -type(ncclRedOp), value :: op -type(ncclComm), value :: comm -integer(cuda_stream_kind), value :: stream -end function ncclReduceScatter -end interface ncclReduceScatter -!End ncclReduceScatter - -!Start ncclBcast -interface ncclBcast -type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast') -import :: c_int -import :: c_devptr, cuda_stream_kind -import :: ncclResult, ncclComm, ncclDataType -implicit none -type(c_devptr), value :: buff -integer(c_int), value :: count -type(ncclDataType), value :: datatype -integer(c_int), value :: root -type(ncclComm), value :: comm -integer(cuda_stream_kind), value :: stream -end function ncclBcast -end interface ncclBcast -!End ncclBcast - -!Start ncclAllGather -interface ncclAllGather -type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather') -import :: c_int -import :: c_devptr, cuda_stream_kind -import :: ncclResult, ncclComm, ncclDataType -implicit none -type(c_devptr), value :: sendbuff -integer(c_int), value :: count -type(ncclDataType), value :: datatype -type(c_devptr), value :: recvbuff -type(ncclComm), value :: comm -integer(cuda_stream_kind), value :: stream -end function ncclAllGather -end interface ncclAllGather -!End ncclAllGather - -!End interfaces - -end module ncclFor -!End nccl module diff --git a/third_party/nccl/fortran/test/allgather_arr_out.f90 b/third_party/nccl/fortran/test/allgather_arr_out.f90 deleted file mode 100644 index 17fbf7a7bce023..00000000000000 --- a/third_party/nccl/fortran/test/allgather_arr_out.f90 +++ /dev/null @@ -1,162 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. 
-!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev -type(ncclDataType) :: dataType -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable :: hostBuff(:, :) -real(real32), allocatable, device :: sendBuff(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -real(real32), allocatable, device :: recvBuff(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 - stat = cudaGetDeviceCount(nDev) - - dataType = ncclFloat - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl * nDev, nDev + 1)) - - call random_number(hostBuff) - - print "(a)", "before allgather:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(sendBuff(nEl)) - sendBuffPtr(i) = c_devloc(sendBuff) - sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(recvBuff(nEl * nDev)) - recvBuffPtr(i) = c_devloc(recvBuff) - recvBuff = hostBuff(:, i) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev]) - hostBuff(:, i) = recvBuff - end do - - print "(a)", "" - print "(a)", "after allgather:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) - hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff - end do - - err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a)", "" - print "(a, e11.4e2)", "maximum error in sendbuff = ", err - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev]) - deallocate(recvBuff) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) - deallocate(sendBuff) - end 
do - - deallocate(sendBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/allgather_ptr_out.f90 b/third_party/nccl/fortran/test/allgather_ptr_out.f90 deleted file mode 100644 index f7d196284a16da..00000000000000 --- a/third_party/nccl/fortran/test/allgather_ptr_out.f90 +++ /dev/null @@ -1,171 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev -type(ncclDataType) :: dataType -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable, target :: hostBuff(:, :) -type(c_ptr), allocatable :: hostBuffPtr(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 - stat = cudaGetDeviceCount(nDev) - - dataType = ncclFloat - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl * nDev, nDev + 1)) - - call random_number(hostBuff) - - print "(a)", "before allgather:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err - end do - - allocate(hostBuffPtr(nDev)) - - do i = 1, nDev - hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1)) - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - hostBuffPtr(i) = c_loc(hostBuff(1, i)) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev) - stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = 
cudaStreamSynchronize(stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost) - end do - - print "(a)", "" - print "(a)", "after allgather:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err - end do - - do i = 1, nDev - hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - end do - - err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a)", "" - print "(a, e11.4e2)", "maximum error in sendbuff = ", err - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(recvBuffPtr(i)) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(sendBuffPtr(i)) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/allreduce_arr_out.f90 b/third_party/nccl/fortran/test/allreduce_arr_out.f90 deleted file mode 100644 index 50c1b6488a1306..00000000000000 --- a/third_party/nccl/fortran/test/allreduce_arr_out.f90 +++ /dev/null @@ -1,165 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev -type(ncclDataType) :: dataType -type(ncclRedOp) :: redOp -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable :: hostBuff(:, :) -real(real32), allocatable, device :: sendBuff(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -real(real32), allocatable, device :: recvBuff(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! 
nDev = 2 - stat = cudaGetDeviceCount(nDev) - - dataType = ncclFloat - redOp = ncclProd - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl, nDev + 2)) - - call random_number(hostBuff(:, 1:nDev + 1)) - - hostBuff(:, nDev + 2) = hostBuff(:, 1) - do i = 2, nDev - hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) - end do - - print "(a)", "before allreduce:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(sendBuff(nEl)) - sendBuffPtr(i) = c_devloc(sendBuff) - sendBuff = hostBuff(:, i) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(recvBuff(nEl)) - recvBuffPtr(i) = c_devloc(recvBuff) - recvBuff = hostBuff(:, i) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - print "(a)", "" - print "(a)", "after allreduce:" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) - hostBuff(:, nDev + 1) = recvBuff - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - print "(a)", "" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) - hostBuff(:, nDev + 1) = sendBuff - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) - deallocate(recvBuff) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) - deallocate(sendBuff) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/allreduce_ptr_out.f90 b/third_party/nccl/fortran/test/allreduce_ptr_out.f90 deleted file mode 100644 index 2c1248f312eb14..00000000000000 --- a/third_party/nccl/fortran/test/allreduce_ptr_out.f90 +++ /dev/null @@ -1,166 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University 
of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev -type(ncclDataType) :: dataType -type(ncclRedOp) :: redOp -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable, target :: hostBuff(:, :) -type(c_ptr), allocatable :: hostBuffPtr(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 - stat = cudaGetDeviceCount(nDev) - - dataType = ncclFloat - redOp = ncclProd - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl, nDev + 2)) - - call random_number(hostBuff(:, 1:nDev + 1)) - - hostBuff(:, nDev + 2) = hostBuff(:, 1) - do i = 2, nDev - hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) - end do - - print "(a)", "before allreduce:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - allocate(hostBuffPtr(nDev + 1)) - - do i = 1, nDev + 1 - hostBuffPtr(i) = c_loc(hostBuff(1, i)) - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - print "(a)", "" - print "(a)", "after allreduce:" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - print "(a)", "" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - err = 
maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(recvBuffPtr(i)) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(sendBuffPtr(i)) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/broadcast_arr.f90 b/third_party/nccl/fortran/test/broadcast_arr.f90 deleted file mode 100644 index 867fa1aadba7f1..00000000000000 --- a/third_party/nccl/fortran/test/broadcast_arr.f90 +++ /dev/null @@ -1,137 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev, root -type(ncclDataType) :: dataType -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable :: hostBuff(:, :) -real(real32), allocatable, device :: devBuff(:) -type(c_devptr), allocatable :: devBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 -! 
root = 0 - stat = cudaGetDeviceCount(nDev) - root = nDev - 1 - - dataType = ncclFloat - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl, nDev + 1)) - - call random_number(hostBuff(:, 1:nDev)) - - hostBuff(:, nDev + 1) = hostBuff(:, root + 1) - - print "(a)", "before broadcast:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err - end do - - allocate(devBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(devBuff(nEl)) - devBuffPtr(i) = c_devloc(devBuff) - devBuff = hostBuff(:, i) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(devBuffPtr(i), devBuff, [nEl]) - hostBuff(:, i) = devBuff - end do - - print "(a)", "" - print "(a)", "after broadcast:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(devBuffPtr(i), devBuff, [nEl]) - deallocate(devBuff) - end do - - deallocate(devBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/broadcast_ptr.f90 b/third_party/nccl/fortran/test/broadcast_ptr.f90 deleted file mode 100644 index 963afeed40ff9b..00000000000000 --- a/third_party/nccl/fortran/test/broadcast_ptr.f90 +++ /dev/null @@ -1,142 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. 
-!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev, root -type(ncclDataType) :: dataType -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable, target :: hostBuff(:, :) -type(c_ptr), allocatable :: hostBuffPtr(:) -type(c_devptr), allocatable :: devBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 -! root = 0 - stat = cudaGetDeviceCount(nDev) - root = nDev - 1 - - dataType = ncclFloat - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl, nDev + 1)) - - call random_number(hostBuff(:, 1:nDev)) - - hostBuff(:, nDev + 1) = hostBuff(:, root + 1) - - print "(a)", "before broadcast:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err - end do - - allocate(hostBuffPtr(nDev)) - - do i = 1, nDev - hostBuffPtr(i) = c_loc(hostBuff(1, i)) - end do - - allocate(devBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - end do - - print "(a)", "" - print "(a)", "after broadcast:" - do i = 1, nDev - err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) - print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(devBuffPtr(i)) - end do - - deallocate(devBuffPtr) - - deallocate(hostBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/reduce_arr_out.f90 b/third_party/nccl/fortran/test/reduce_arr_out.f90 deleted file mode 100644 index 17e41b45fe5797..00000000000000 --- 
a/third_party/nccl/fortran/test/reduce_arr_out.f90 +++ /dev/null @@ -1,164 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev, root -type(ncclDataType) :: dataType -type(ncclRedOp) :: redOp -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable :: hostBuff(:, :) -real(real32), allocatable, device :: sendBuff(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -real(real32), allocatable, device :: recvBuff(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 -! root = 0 - stat = cudaGetDeviceCount(nDev) - root = nDev - 1 - - dataType = ncclFloat - redOp = ncclProd - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl, nDev + 2)) - - call random_number(hostBuff(:, 1:nDev + 1)) - - hostBuff(:, nDev + 2) = hostBuff(:, 1) - do i = 2, nDev - hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) - end do - - print "(a)", "before reduce:" - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(sendBuff(nEl)) - sendBuffPtr(i) = c_devloc(sendBuff) - sendBuff = hostBuff(:, i) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(recvBuff(nEl)) - recvBuffPtr(i) = c_devloc(recvBuff) - recvBuff = hostBuff(:, i) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - stat = cudaSetDevice(devList(root + 1)) - call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl]) - hostBuff(:, nDev + 1) = recvBuff - - print "(a)", "" - print "(a)", "after reduce:" - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err - - print "(a)", "" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) - hostBuff(:, nDev + 1) = sendBuff - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 
1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) - deallocate(recvBuff) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) - deallocate(sendBuff) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/reduce_ptr_out.f90 b/third_party/nccl/fortran/test/reduce_ptr_out.f90 deleted file mode 100644 index 777f8ea07685e4..00000000000000 --- a/third_party/nccl/fortran/test/reduce_ptr_out.f90 +++ /dev/null @@ -1,165 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev, root -type(ncclDataType) :: dataType -type(ncclRedOp) :: redOp -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable, target :: hostBuff(:, :) -type(c_ptr), allocatable :: hostBuffPtr(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 -! 
root = 0 - stat = cudaGetDeviceCount(nDev) - root = nDev - 1 - - dataType = ncclFloat - redOp = ncclProd - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl, nDev + 2)) - - call random_number(hostBuff(:, 1:nDev + 1)) - - hostBuff(:, nDev + 2) = hostBuff(:, 1) - do i = 2, nDev - hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) - end do - - print "(a)", "before reduce:" - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err - - allocate(hostBuffPtr(nDev + 1)) - - do i = 1, nDev + 1 - hostBuffPtr(i) = c_loc(hostBuff(1, i)) - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - stat = cudaSetDevice(devList(root + 1)) - stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - - print "(a)", "" - print "(a)", "after reduce:" - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err - - print "(a)", "" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(recvBuffPtr(i)) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(sendBuffPtr(i)) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/reducescatter_arr_out.f90 b/third_party/nccl/fortran/test/reducescatter_arr_out.f90 deleted 
file mode 100644 index 6a976dac1f3c19..00000000000000 --- a/third_party/nccl/fortran/test/reducescatter_arr_out.f90 +++ /dev/null @@ -1,165 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev -type(ncclDataType) :: dataType -type(ncclRedOp) :: redOp -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable :: hostBuff(:, :) -real(real32), allocatable, device :: sendBuff(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -real(real32), allocatable, device :: recvBuff(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! nDev = 2 - stat = cudaGetDeviceCount(nDev) - - dataType = ncclFloat - redOp = ncclProd - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl * nDev, nDev + 2)) - - call random_number(hostBuff(:, 1:nDev + 1)) - - hostBuff(:, nDev + 2) = hostBuff(:, 1) - do i = 2, nDev - hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) - end do - - print "(a)", "before reducescatter:" - do i = 1, nDev - err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(sendBuff(nEl * nDev)) - sendBuffPtr(i) = c_devloc(sendBuff) - sendBuff = hostBuff(:, i) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - allocate(recvBuff(nEl)) - recvBuffPtr(i) = c_devloc(recvBuff) - recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - print "(a)", "" - print "(a)", "after reducescatter:" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) - hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff - err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - 
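- ! note: after the reduce-scatter, rank i holds only the i-th nEl-element
- ! chunk of the element-wise product, hence the sliced comparison above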
end do - - print "(a)", "" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev]) - hostBuff(:, nDev + 1) = sendBuff - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) - deallocate(recvBuff) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev]) - deallocate(sendBuff) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/fortran/test/reducescatter_ptr_out.f90 b/third_party/nccl/fortran/test/reducescatter_ptr_out.f90 deleted file mode 100644 index 9df35bffe858fa..00000000000000 --- a/third_party/nccl/fortran/test/reducescatter_ptr_out.f90 +++ /dev/null @@ -1,174 +0,0 @@ -!************************************************************************* -!* Copyright (c) 2016 Research Computing Services (RCS), University of -!* Cambridge. All rights reserved. -!* -!* See LICENSE.txt for license information -!************************************************************************* - -program test -use iso_c_binding -use iso_fortran_env -use cudaFor -use ncclFor -implicit none -integer(int32) :: stat, i -real(real32) :: err -integer(int32) :: nEl, nDev -type(ncclDataType) :: dataType -type(ncclRedOp) :: redOp -type(ncclComm), allocatable :: comm(:) -integer(int32), allocatable :: devList(:) -type(ncclResult) :: res -integer(int32) :: cudaDev, rank -integer(cuda_stream_kind), allocatable :: stream(:) -integer(int32) :: time(8) -integer(int32), allocatable :: seed(:) -real(real32), allocatable, target :: hostBuff(:, :) -type(c_ptr), allocatable :: hostBuffPtr(:) -type(c_devptr), allocatable :: sendBuffPtr(:) -type(c_devptr), allocatable :: recvBuffPtr(:) - - nEl = 2621440 - -! 
nDev = 2 - stat = cudaGetDeviceCount(nDev) - - dataType = ncclFloat - redOp = ncclProd - - allocate(comm(nDev)) - allocate(devList(nDev)) - - do i = 1, nDev - devList(i) = i - 1 - end do - - res = ncclCommInitAll(comm, nDev, devList) - - do i = 1, nDev - res = ncclCommCuDevice(comm(i), cudaDev) - res = ncclCommUserRank(comm(i), rank) - end do - - allocate(stream(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamCreate(stream(i)) - end do - - call date_and_time(values = time) - call random_seed(size = i) - allocate(seed(i)) - call random_seed(get = seed) - seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed - call random_seed(put = seed) - - allocate(hostBuff(nEl * nDev, nDev + 2)) - - call random_number(hostBuff(:, 1:nDev + 1)) - - hostBuff(:, nDev + 2) = hostBuff(:, 1) - do i = 2, nDev - hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) - end do - - print "(a)", "before reducescatter:" - do i = 1, nDev - err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - allocate(hostBuffPtr(nDev + 1)) - - do i = 1, nDev + 1 - hostBuffPtr(i) = c_loc(hostBuff(1, i)) - end do - - allocate(sendBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev) - stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1)) - end do - - allocate(recvBuffPtr(nDev)) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) - stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) - end do - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaStreamSynchronize(stream(i)) - end do - - print "(a)", "" - print "(a)", "after reduceScatter:" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) - err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err - end do - - do i = 1, nDev + 1 - hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1)) - end do - - print "(a)", "" - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost) - err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) - print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err - end do - print "(a)", "" - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(recvBuffPtr(i)) - end do - - deallocate(recvBuffPtr) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = cudaFree(sendBuffPtr(i)) - end do - - deallocate(sendBuffPtr) - - deallocate(hostBuffPtr) - - deallocate(hostBuff) - - deallocate(seed) - - do i = 1, nDev - stat = cudaSetDevice(devList(i)) - stat = 
cudaStreamDestroy(stream(i)) - end do - - deallocate(stream) - - do i = 1, nDev - call ncclCommDestroy(comm(i)) - end do - - deallocate(devList) - deallocate(comm) - -end program test diff --git a/third_party/nccl/nccl b/third_party/nccl/nccl new file mode 160000 index 00000000000000..f93fe9bfd94884 --- /dev/null +++ b/third_party/nccl/nccl @@ -0,0 +1 @@ +Subproject commit f93fe9bfd94884cec2ba711897222e0df5569a53 diff --git a/third_party/nccl/src/all_gather.cu b/third_party/nccl/src/all_gather.cu deleted file mode 100644 index cb36b7179673fb..00000000000000 --- a/third_party/nccl/src/all_gather.cu +++ /dev/null @@ -1,202 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "primitives.h" - -#define NUM_SUBSTEPS 2 -#define NUM_BUFCHUNKS 2 - -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -template -__launch_bounds__(THREADS+WARP_SIZE, 1) -__global__ void AllGatherKernel(const KernelArgs args) { - const int tid = threadIdx.x; - __shared__ T* sharedNextOutput; - __shared__ DevRing ring; - bool pushrecv = args.pushrecv; - - LoadRing(args.ring, &ring); - __syncthreads(); - - if (tid == 0) { - WaitFlag prevCommOp(ring.prevOpCounter, 0); - WaitFlag nextCommOp(ring.nextOpCounter, 0); - prevCommOp.wait(args.opIndex); - nextCommOp.wait(args.opIndex); - if (pushrecv) { - *ring.sendPtrToPrev = (T*)args.ThisOutput; - Wait([=] { - return *ring.recvPtrFromNext != nullptr; - }); - sharedNextOutput = *ring.recvPtrFromNext; - *ring.recvPtrFromNext = nullptr; - } - } - __syncthreads(); - - WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS); - PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS); - PostFlag postReadyToNext(ring.sendFlagToNext, 0); - - typedef Primitives Prims; - - const int size = args.N; - const int nranks = args.nRanks; - const int buffSize = args.buffSize / sizeof(T); - const int sliceSize = buffSize / NUM_BUFCHUNKS; - - int step = 0; - int poffset, noffset = 0; - - // Compute pointers - const T * __restrict__ thisInput = args.ThisInput; - T * __restrict__ thisOutput = args.ThisOutput; - T * __restrict__ prevInput = ring.recvBuffer; - T * __restrict__ nextOutput = ring.sendBuffer; - - for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) { - /////////////// begin AllGather steps /////////////// - int offset; - int maxOffset = size-chunkOffset; - int rankDest; - - // step 0: push data to next GPU - rankDest = ring.userRank[0]; - offset = chunkOffset + rankDest * size; - - if (thisInput == thisOutput) { - Prims::Copy( - thisInput + offset, - pushrecv ? sharedNextOutput + offset : nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } else { - Prims::DoubleCopy( - thisInput + chunkOffset, - thisOutput + offset, - pushrecv ? 
sharedNextOutput + offset : nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - - NEXT_STEP; // Increases step, poffset, noffset - - // k-2 steps: copy to next GPU - if (pushrecv) { - for (int j=1; j -ncclResult_t RingAllGather(const void* sendbuff, void* recvbuff, - const int count, ncclComm* comm, cudaStream_t stream) { - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError); - } else { - KernelArgs args; - ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm); - LAUNCH_KERNEL(AllGatherKernel, THREADS, UNROLL, FUNC, T, args, stream); - } - - return ncclSuccess; -} - -template class RedOp> -class AllGather { - public: - static ncclResult_t entry(const void* sendbuff, void* recvbuff, - int count, int /*root*/, ncclComm* comm, cudaStream_t stream) { - return RingAllGather, T>(sendbuff, recvbuff, count, comm, stream); - } -}; - -NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, int count, ncclDataType_t datatype, - void* recvbuff, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, - void* recvbuff, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, "AllGather")); - return enqueue(sendbuff, recvbuff, count, datatype, 0, comm, stream); -} - diff --git a/third_party/nccl/src/all_reduce.cu b/third_party/nccl/src/all_reduce.cu deleted file mode 100644 index 2f38d6e5f77935..00000000000000 --- a/third_party/nccl/src/all_reduce.cu +++ /dev/null @@ -1,234 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "primitives.h" - -#define NUM_SUBSTEPS 2 -#define NUM_BUFCHUNKS 2 - -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -template -__launch_bounds__(THREADS+WARP_SIZE, 1) -__global__ void AllReduceKernel(const KernelArgs args) { - const int tid = threadIdx.x; - __shared__ T* sharedNextOutput; - __shared__ DevRing ring; - bool pushrecv = args.pushrecv; - - LoadRing(args.ring, &ring); - __syncthreads(); - - if (tid == 0) { - WaitFlag prevCommOp(ring.prevOpCounter, 0); - WaitFlag nextCommOp(ring.nextOpCounter, 0); - prevCommOp.wait(args.opIndex); - nextCommOp.wait(args.opIndex); - if (pushrecv) { - *ring.sendPtrToPrev = (T*)args.ThisOutput; - Wait([=] { - return *ring.recvPtrFromNext != nullptr; - }); - sharedNextOutput = *ring.recvPtrFromNext; - *ring.recvPtrFromNext = nullptr; - } - } - __syncthreads(); - - WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS); - PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS); - PostFlag postReadyToNext(ring.sendFlagToNext, 0); - - typedef Primitives Prims; - - const int size = args.N; - const int nranks = args.nRanks; - const int buffSize = args.buffSize / sizeof(T); - const int sliceSize = buffSize / NUM_BUFCHUNKS; - - int step = 0; - int poffset, noffset = 0; - - // Compute pointers - const T * __restrict__ thisInput = args.ThisInput; - T * __restrict__ thisOutput = args.ThisOutput; - T * __restrict__ prevInput = ring.recvBuffer; - T * __restrict__ nextOutput = ring.sendBuffer; - - for (int chunkOffset = 0; chunkOffset < size; chunkOffset += nranks*sliceSize) { - /////////////// begin AllReduce steps /////////////// - int offset; - int maxOffset; - int slice; - int chunkSize = min(sliceSize, DIVUP(size-chunkOffset,nranks)); - ALIGN_SIZE(chunkSize, THREADS*UNROLL); - - // step 0: push data to next GPU - slice = ring.userRank[nranks-1]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - Prims::Copy( - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; // Increases step, poffset, noffset - - // k-2 steps: reduce and copy to next GPU - for (int j=2; j -ncclResult_t RingAllReduce(const void* sendbuff, void* recvbuff, - const int count, ncclComm* comm, cudaStream_t stream) { - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError); - } else { - KernelArgs args; - ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm); - LAUNCH_KERNEL(AllReduceKernel, THREADS, UNROLL, FUNC, T, args, stream); - } - - return ncclSuccess; -} - -template class RedOp> -class AllReduce { - public: - static ncclResult_t entry(const void* sendbuff, void* recvbuff, - int count, int /*root*/, ncclComm* comm, cudaStream_t stream) { - return RingAllReduce, T>(sendbuff, recvbuff, count, comm, stream); - } -}; - -NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, int count, - 
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, 0, comm, "AllReduce")); - return enqueue(sendbuff, recvbuff, count, datatype, op, 0, comm, stream); -} - diff --git a/third_party/nccl/src/broadcast.cu b/third_party/nccl/src/broadcast.cu deleted file mode 100644 index 3a7cb119cff3e4..00000000000000 --- a/third_party/nccl/src/broadcast.cu +++ /dev/null @@ -1,164 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "primitives.h" - -#define NUM_SUBSTEPS 4 -#define NUM_BUFCHUNKS 2 - -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -template -__launch_bounds__(THREADS+WARP_SIZE, 1) -__global__ void BroadcastKernel(const KernelArgs args) { - const int tid = threadIdx.x; - __shared__ T* sharedNextOutput; - __shared__ DevRing ring; - bool pushrecv = args.pushrecv; - - LoadRing(args.ring, &ring); - __syncthreads(); - - if (tid == 0) { - WaitFlag prevCommOp(ring.prevOpCounter, 0); - WaitFlag nextCommOp(ring.nextOpCounter, 0); - prevCommOp.wait(args.opIndex); - nextCommOp.wait(args.opIndex); - if (pushrecv) { - *ring.sendPtrToPrev = (T*)args.ThisOutput; - Wait([=] { - return *ring.recvPtrFromNext != nullptr; - }); - sharedNextOutput = *ring.recvPtrFromNext; - *ring.recvPtrFromNext = nullptr; - } - } - __syncthreads(); - - WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0); - PostFlag postDoneToPrev(ring.sendFlagToPrev, 0); - PostFlag postReadyToNext(ring.sendFlagToNext, 0); - - typedef Primitives Prims; - - const int size = args.N; - const int rank = ring.userRank[0]; - const int nextRank = ring.userRank[1]; - const int root = args.root; - const int buffSize = args.buffSize / sizeof(T); - const int sliceSize = buffSize / NUM_BUFCHUNKS; - - int step = 0; - int boffset = 0; - - // Compute pointers - const T * __restrict__ thisInput = args.ThisInput; - T * __restrict__ thisOutput = args.ThisOutput; - T * __restrict__ prevInput = ring.recvBuffer; - T * __restrict__ nextOutput = ring.sendBuffer; - - for (int offset = 0; offset < size; offset += sliceSize) { - int maxOffset = size-offset; - if (rank == root) { - Prims::Copy( - thisInput + offset, - pushrecv ? 
sharedNextOutput + offset : nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - } else if (nextRank == root) { - if (pushrecv) maxOffset = 0; // Only wait for signals - Prims::Copy( - prevInput + boffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - if (pushrecv) { - Prims::Copy( - thisOutput + offset, - sharedNextOutput + offset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } else { - Prims::DoubleCopy( - prevInput + boffset, - thisOutput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - } - NEXT_STEP; // Increases step, boffset - } - - // wait for the last data to be pushed to us - if (tid == 0) { - if (nextRank != root) { - // Wait for last update from next then reset the flag - waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1)); - *ring.recvFlagFromNext = 0; - } - - if (rank != root) { - // reset the flag - *ring.recvFlagFromPrev = 0; - } - - incrementOpCounter(&args); - } -} - -#define THREADS 256 -#define UNROLL 8 - -template -ncclResult_t RingBroadcast(void* buff, const int count, const int root, - ncclComm* comm, cudaStream_t stream) { - if (comm->nRanks != 1) { - KernelArgs args; - ArgsSetup(&args, buff, buff, root, count, comm); - LAUNCH_KERNEL(BroadcastKernel, THREADS, UNROLL, FUNC, T, args, stream); - } - - return ncclSuccess; -} - -template class RedOp> -class Broadcast { - public: - static ncclResult_t entry(const void* sendbuff, void* recvbuff, - int count, int root, ncclComm* comm, cudaStream_t stream) { - return RingBroadcast, T>(recvbuff, count, root, comm, stream); - } -}; - -NCCL_API(ncclResult_t, ncclBcast, void* buff, int count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ArgsCheck(buff, buff, count, datatype, ncclSum, root, comm, "Bcast")); - return enqueue(nullptr, buff, count, datatype, root, comm, stream); -} - diff --git a/third_party/nccl/src/common_coll.h b/third_party/nccl/src/common_coll.h deleted file mode 100644 index 54050f8e46158a..00000000000000 --- a/third_party/nccl/src/common_coll.h +++ /dev/null @@ -1,115 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COMMON_COLL_H_ -#define COMMON_COLL_H_ - -#include "core.h" - -static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { - cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, pointer); - if (err != cudaSuccess || attr.devicePointer == NULL) { - WARN("%s : %s is not a valid pointer\n", opname, ptrname); - return ncclInvalidDevicePointer; - } - if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { - WARN("%s : %s allocated on device %d mismatchs with NCCL device %d \n", opname, ptrname, attr.device, comm->cudaDev); - return ncclInvalidDevicePointer; - } - return ncclSuccess; -} - -static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { - if (ptr == NULL) { - WARN("%s : %s argument is NULL", opname, ptrname); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, int count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) { - NCCLCHECK(PtrCheck(comm, opname, "comm")); - // First, the easy ones - if (root < 0 || root >= comm->nRanks) { - WARN("%s : invalid root %d (root should be in the 0..%d range)\n", opname, root, comm->nRanks); - return ncclInvalidRank; - } - if (type < 0 || type >= nccl_NUM_TYPES) { - WARN("%s : invalid type %d\n", opname, type); - return ncclInvalidType; - } - if (op < 0 || op >= nccl_NUM_OPS) { - WARN("%s : invalid reduction operation %d\n", opname, op); - return ncclInvalidOperation; - } - if (count < 0) { - WARN("%s : invalid count %d\n", opname, count); - return ncclInvalidArgument; - } - - // Check pointers - NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname)) - if (strcmp(opname, "Reduce") == 0 && comm->rank != root) { - // No need to check recvbuff pointer for non-root reduce - return ncclSuccess; - } - NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname)) - return ncclSuccess; -} - -// Kernel launch -template -struct KernelArgs { - // general parameters - int nRanks; - int root; - int buffSize; - int N; - int opIndex; - volatile int * __restrict__ opCounter; - int * __restrict__ doneCount; - bool pushrecv; - - // some pre-computed sizes - int SliceSize; - int SliceOffset; - int ChunkSize; - int NumChunks; - - // local and remote input, output, and buffer - const T * __restrict__ ThisInput; - T * __restrict__ ThisOutput; - - DevRing* ring; -}; - -template -void ArgsSetup(KernelArgs *args, const void* sendbuff, void* recvbuff, - const int root, const int count, ncclComm *comm) { - args->nRanks = comm->nRanks; - args->root = root; - args->buffSize = comm->buffSize; - args->N = count; - args->opIndex = comm->opSched; - args->opCounter = comm->opCounter; - args->ThisInput = (const T*)sendbuff; - args->ThisOutput = (T*)recvbuff; - args->ring = comm->devRing; - args->pushrecv = comm->globalMemSpace; -} - -#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \ - args, stream) do { \ - dim3 grid(1, 1, 1); \ - dim3 block(THREADS+1, 1, 1); \ - void* argptrs[] = {&args}; \ - CUDACHECK(cudaLaunchKernel( \ - (void*)K, \ - grid, block, argptrs, 0, stream), ncclUnhandledCudaError); \ -} while (0) - -#endif diff --git a/third_party/nccl/src/common_kernel.h b/third_party/nccl/src/common_kernel.h deleted file mode 100644 index b96519f78ab6d1..00000000000000 --- 
a/third_party/nccl/src/common_kernel.h +++ /dev/null @@ -1,362 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - - -#ifndef COMMON_KERNEL_H_ -#define COMMON_KERNEL_H_ - -#include -#include - -#include - -// BAR macro and helpers -#define WARP_SIZE 32 -#define ROUNDUP(x, y) \ - (((((x) + (y) - 1) / (y))) * (y)) -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define BAR_EXEC(type, barid, nthreads) \ - asm("bar." #type " " #barid ", " #nthreads ";\n\t") -#define BAR_EXPAND(type, barid, nthreads) \ - BAR_EXEC(type, barid, (nthreads)) - -// Named barrier macro. -// Expands to asm("bar.type barid, nthreads") where -// nthreads has been rounded up to WARP_SIZE. -#define BAR(type, barid, nthreads) \ - BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE)) - -template inline __device__ -T vFetch(const volatile T* ptr) { - return *ptr; -} - -template inline __device__ -void vStore(volatile T* ptr, const T val) { - *ptr = val; -} - -#ifdef CUDA_HAS_HALF -#if CUDART_VERSION < 9000 -template<> inline __device__ -half vFetch(const volatile half* ptr) { - half r; - r.x = ptr->x; - return r; -} -template<> inline __device__ -void vStore(volatile half* ptr, const half val) { - ptr->x = val.x; -} -#else -template<> inline __device__ -half vFetch(const volatile half* ptr) { - return *((half*)ptr); -} -template<> inline __device__ -void vStore(volatile half* ptr, const half val) { - *((half*)ptr) = val; -} -#endif -#endif - -__device__ unsigned int spinct; - -// Spin wait until func evaluates to true -template -__device__ inline void Wait(const FUNC& func) { - while (!func()) { - // waste time - atomicInc(&spinct, 10); - } -} - -typedef uint64_t PackType; - -// unpack x and y to elements of type T and apply FUNC to each element -template -struct MULTI { - __device__ PackType operator()(const PackType x, const PackType y) const; -}; - -template -struct MULTI { - static_assert(sizeof(PackType) == 2 * sizeof(uint32_t), - "PackType must be twice the size of uint32_t."); - union converter { - PackType storage; - struct { - uint32_t a, b; - }; - }; - - __device__ PackType operator()(const PackType x, const PackType y) const { - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - - // for char, we do these as vector ops - cr.a = FUNC()(cx.a, cy.a); - cr.b = FUNC()(cx.b, cy.b); - - return cr.storage; - } -}; - -template -struct MULTI { - static_assert(sizeof(PackType) == 2 * sizeof(int), - "PackType must be twice the size of int."); - union converter { - PackType storage; - struct { - int a, b; - }; - }; - - __device__ PackType operator()(const PackType x, const PackType y) const { - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - - cr.a = FUNC()(cx.a, cy.a); - cr.b = FUNC()(cx.b, cy.b); - - return cr.storage; - } -}; - -#ifdef CUDA_HAS_HALF -template -struct MULTI { - static_assert(sizeof(PackType) == 4 * sizeof(half), - "PackType must be four times the size of half."); - - struct PackHalf2 { - half2 a, b; - }; - - __device__ PackType operator()(const PackType x, const PackType y) const { - struct PackHalf2 cx, cy, cr; - cx = *(reinterpret_cast(&x)); - cy = *(reinterpret_cast(&y)); - - cr.a = FUNC()(cx.a, cy.a); - cr.b = FUNC()(cx.b, cy.b); - - return *(reinterpret_cast(&cr)); - } -}; -#endif - -template -struct MULTI { - static_assert(sizeof(PackType) == 2 * sizeof(float), 
- "PackType must be twice the size of float."); - union converter { - PackType storage; - struct { - float a, b; - }; - }; - - __device__ PackType operator()(const PackType x, const PackType y) const { - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - - cr.a = FUNC()(cx.a, cy.a); - cr.b = FUNC()(cx.b, cy.b); - - return cr.storage; - } -}; - -template -struct MULTI { - static_assert(sizeof(PackType) == sizeof(double), - "PackType must be the same size as double."); - __device__ PackType operator()(const PackType x, const PackType y) const { - double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y)); - return __double_as_longlong(rv); - } -}; - -template -struct MULTI { - static_assert(sizeof(PackType) == sizeof(unsigned long long), - "PackType must be the same size as unsigned long long."); - __device__ PackType operator()(const PackType x, const PackType y) const { - unsigned long long rv = FUNC()(x, y); - return rv; - } -}; - -template -struct MULTI { - static_assert(sizeof(PackType) == sizeof(long long), - "PackType must be the same size as long long."); - __device__ PackType operator()(const PackType x, const PackType y) const { - long long rv = FUNC()((long long)x, (long long)y); - return rv; - } -}; - -template -__device__ inline void ReduceCopy( - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, - volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, const int idx) { - T val = vFetch(src0+idx); - if (TWO_INPUTS) { - val = FUNC()(val, vFetch(src1+idx)); - } - vStore(dest0+idx, val); - if (TWO_OUTPUTS) { - vStore(dest1+idx, val); - } -} - -template -__device__ inline void ReduceCopy64b( - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, - volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, const int offset) { - PackType t0[UNROLL]; - PackType t1[UNROLL]; - #pragma unroll - for (int u = 0; u < UNROLL; ++u) { - int idx = offset + u*THREADS; - t0[u] = (reinterpret_cast(src0))[idx]; - if (TWO_INPUTS) { - t1[u] = (reinterpret_cast(src1))[idx]; - } - } - #pragma unroll - for (int u = 0; u < UNROLL; ++u) { - int idx = offset + u*THREADS; - PackType val = TWO_INPUTS ? MULTI()(t0[u], t1[u]) : t0[u]; - (reinterpret_cast(dest0))[idx] = val; - if (TWO_OUTPUTS) { - (reinterpret_cast(dest1))[idx] = val; - } - } -} - -#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a)) - -template -__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { - size_t ptrval = reinterpret_cast(ptr); - return reinterpret_cast(ALIGNUP(ptrval, align)); -} - -// Assumptions: -// - there is exactly 1 block -// - THREADS is the number of producer threads -// - this function is called by all producer threads -template -__device__ inline void ReduceOrCopy(const int tid, - volatile T * __restrict__ dest0, volatile T * __restrict__ dest1, - const volatile T * __restrict__ src0, const volatile T * __restrict__ src1, - int N) { - if (N<=0) { - return; - } - - int Npreamble = (N(src0, src1, dest0, dest1, idx); - } - - // stage 2: fast path: use 64b loads/stores to do the bulk of the work, - // assuming the pointers we have are all 64-bit alignable. 
- if (alignable) { - const int PackFactor = sizeof(PackType) / sizeof(T); - int Nrem = N - Npreamble; - dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; } - src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; } - - // stage 2a: main loop - int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS)) - * (UNROLL * THREADS); // round down - - #pragma unroll 1 // don't unroll this loop - for (int idx = tid; idx < Nalign2a; idx += UNROLL * THREADS) { - ReduceCopy64b(src0, src1, dest0, dest1, idx); - } - - int Ndone2a = Nalign2a * PackFactor; - Nrem -= Ndone2a; - - // stage 2b: slightly less optimized for section when we don't have full - // UNROLLs - - int Nalign2b = Nrem / PackFactor; - - #pragma unroll 4 - for (int idx = Nalign2a + tid; idx < Nalign2a + Nalign2b; idx += THREADS) { - ReduceCopy64b(src0, src1, dest0, dest1, idx); - } - - int Ndone2b = Nalign2b * PackFactor; - Nrem -= Ndone2b; - int Ndone2 = Ndone2a + Ndone2b; - dest0 += Ndone2; if (HAS_DEST1) { dest1 += Ndone2; } - src0 += Ndone2; if (HAS_SRC1) { src1 += Ndone2; } - - // stage 2c: tail - - for (int idx = tid; idx < Nrem; idx += THREADS) { - // never ought to make it more than one time through this loop. only a - // few threads should even participate - ReduceCopy(src0, src1, dest0, dest1, idx); - } - } // done fast path -} - -template -__device__ inline void incrementOpCounter(const KernelArgs *args) { - // increment comm's operation counts - __threadfence_system(); // Technically need to ensure that cleared flags - // are visible before incrementing op counter. - *args->opCounter = args->opIndex+1; -} - -template __device__ __forceinline__ -void LoadRing(const DevRing* src, DevRing* dst) { - enum { NUM_WORDS = sizeof(DevRing) / sizeof(long long) }; - static_assert(sizeof(DevRing) % sizeof(long long) == 0, "Bad alignment"); - static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing"); - static_assert(sizeof(DevRing) == sizeof(DevRing), "DevRing size mismatch"); - long long* lldst = reinterpret_cast(dst); - const long long* llsrc = reinterpret_cast(src); - if (threadIdx.x < NUM_WORDS) { - lldst[threadIdx.x] = llsrc[threadIdx.x]; - } -} - - -#endif // COMMON_KERNEL_H_ diff --git a/third_party/nccl/src/copy_kernel.h b/third_party/nccl/src/copy_kernel.h deleted file mode 100644 index 0f69748fac7358..00000000000000 --- a/third_party/nccl/src/copy_kernel.h +++ /dev/null @@ -1,55 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - - -#ifndef COPY_KERNEL_H_ -#define COPY_KERNEL_H_ - -#include "common_kernel.h" - -template -struct FuncPassA { - __device__ T operator()(const T x, const T y) const { - return x; - } -}; - -#ifdef CUDA_HAS_HALF -template <> -struct FuncPassA { - __device__ half2 operator()(const half2 x, const half2 y) const { - return x; - } - __device__ half operator()(const half x, const half y) const { - return x; - } -}; -#endif - -// Assumptions: -// - there is exactly 1 block -// - THREADS is the number of producer threads -// - this function is called by all producer threads -template -__device__ void Copy(volatile T * __restrict__ const dest, - const volatile T * __restrict__ const src, const int N) { - ReduceOrCopy, T, false, false>(threadIdx.x, - dest, nullptr, src, nullptr, N); -} - -// Assumptions: -// - there is exactly 1 block -// - THREADS is the number of producer threads -// - this function is called by all producer threads -template -__device__ void DoubleCopy(volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, - const volatile T * __restrict__ const src, const int N) { - ReduceOrCopy, T, true, false>(threadIdx.x, - dest0, dest1, src, nullptr, N); -} - -#endif // COPY_KERNEL_H_ diff --git a/third_party/nccl/src/core.cu b/third_party/nccl/src/core.cu deleted file mode 100644 index 1420d21c31de76..00000000000000 --- a/third_party/nccl/src/core.cu +++ /dev/null @@ -1,1019 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include -#include "core.h" -#include "libwrap.h" -#include "common_coll.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -DebugLevel ncclDebugLevel; - -NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); -ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { - NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); - pid_t pid = getpid(); - static int count = 0; - int commId = __sync_fetch_and_add(&count, 1); - int len = snprintf(out->internal, NCCL_UNIQUE_ID_BYTES, "nccl-%d-%d", pid, commId); - if(strlen(out->internal) < len) { - WARN("ncclUniqueId truncated"); - return ncclInternalError; - } - return ncclSuccess; -} - - -static ncclResult_t shmOpen(const char* shmname, size_t bytes, void** ptr) { - int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); - if (fd == -1) { - WARN("shm_open failed to open %s", shmname); - return ncclSystemError; - } - - if (ftruncate(fd, bytes) == -1) { - WARN("ftruncate failed to allocate %ld bytes", bytes); - shm_unlink(shmname); - close(fd); - return ncclSystemError; - } - - *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (*ptr == MAP_FAILED) { - WARN("failure in mmap"); - shm_unlink(shmname); - close(fd); - return ncclSystemError; - } - - close(fd); - return ncclSuccess; -} - -static ncclResult_t shmUnlink(const char* shmname) { - if(shm_unlink(shmname) == -1) { - WARN("smh_unlink failed"); - return ncclSystemError; - } else { - return ncclSuccess; - } -} - -static ncclResult_t shmUnmap(void* ptr, size_t bytes) { - if(munmap(ptr, bytes) == -1) { - WARN("munmap failed"); - return ncclSystemError; - } else { - return ncclSuccess; - } -} - - -typedef struct { - int rank; - 
int ndev; - int cudaDev; - int sortId; - pid_t pid; - ncclMem* hostptr; - ncclMem* devptr; - cudaIpcMemHandle_t devipc; - size_t buffSize; -} RankEntry; - -static int compRanks(const void* a, const void* b) { - const RankEntry* A = (const RankEntry*)a; - const RankEntry* B = (const RankEntry*)b; - if (A->sortId < B->sortId) return -1; - if (A->sortId > B->sortId) return 1; - return 0; -} - -static void orderRanks(RankEntry* ranks, int count) { - qsort(ranks, count, sizeof(RankEntry), compRanks); -} - - -typedef struct { - union { - struct { - volatile int bar; - int globalMemSpaceBroke; - }; - char pad[16]; - }; - RankEntry ranks[1]; -} RankGather; - -static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId, - int ndev, int rank, RankEntry myInfo) { - size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry); - RankGather* tmp = NULL; - int bar_tmp; - - ncclResult_t res = shmOpen(commId.internal, bytes, (void**)&tmp); - if (res != ncclSuccess) { - WARN("rank %d failed to open shm segment for gather", rank); - return res; - } - - tmp->ranks[rank] = myInfo; - - bar_tmp = tmp->bar - 1; - bool swapped; - do { - bar_tmp += 1; - if (bar_tmp == ndev-1) { // everyone is done - ncclResult_t res = shmUnlink(commId.internal); - if (res != ncclSuccess) { - WARN("rank %d failed to unlink shm segment for gather", rank); - shmUnmap(tmp, bytes); - return res; - } - - orderRanks(tmp->ranks, ndev); - } - swapped = __sync_bool_compare_and_swap(&tmp->bar, bar_tmp, bar_tmp+1); - } while(!swapped); - - while (tmp->bar < ndev) - sched_yield(); - __sync_synchronize(); - - *gather = tmp; - return ncclSuccess; -} - -static void syncRingDirect(RankGather* gather, int* globalMemSpaceOk) { - int bar_tmp = gather->bar - 1; - int ndev = gather->ranks[0].ndev; - bool swapped; - do { - bar_tmp += 1; - swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1); - } while(!swapped); - - while (gather->bar < 2*ndev) // Wait for all ranks to arrive at this second barrier - sched_yield(); - __sync_synchronize(); - - *globalMemSpaceOk = gather->globalMemSpaceBroke ? 
0 : 1; -} - -static ncclResult_t closeGather(RankGather* gather, int ndev) { - int bar_tmp = gather->bar - 1; - bool swapped; - do { - bar_tmp += 1; - swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1); - } while(!swapped); - - while (gather->bar < 3*ndev) // Wait for all ranks to arrive at this third barrier - sched_yield(); - __sync_synchronize(); - - size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry); - ncclResult_t res = shmUnmap(gather, bytes); - if (res != ncclSuccess) { - WARN("failed to unmap %ld bytes of gather", bytes); - return res; - } - - return ncclSuccess; -} - - -static ncclResult_t allocDevMem(ncclMem** ptr, size_t buffSize) { - size_t size = offsetof(struct ncclMem, buff) + buffSize; - cudaError_t res = cudaMalloc((void**)ptr, size); - if (res != cudaSuccess) { - *ptr = NULL; - WARN("failed to allocate %lu byte device buffer", size); - return ncclCudaMallocFailed; - } - if (cudaMemset(*ptr, 0, size) != cudaSuccess) { - WARN("failed to memset device buffer."); - cudaFree(*ptr); - *ptr = NULL; - return ncclUnhandledCudaError; - } - return ncclSuccess; -} - -static const int ShmMapped = 1; -static const int ShmLinked = 2; - -static ncclResult_t allocHostMem(ncclMem** ptr, size_t buffSize) { - size_t size = offsetof(struct ncclMem, buff) + buffSize; - cudaError_t res = cudaMallocHost((void**)ptr, size); - if (res != cudaSuccess) { - *ptr = NULL; - WARN("failed to allocate %lu byte host buffer", size); - return ncclSystemError; - } - memset(*ptr, 0, size); - return ncclSuccess; -} - -static ncclResult_t openHostMemShm(const char* shmname, ncclMem** ptr, size_t buffSize) { - size_t size = offsetof(struct ncclMem, buff) + buffSize; - ncclResult_t res = shmOpen(shmname, size, (void**)ptr); - if (res != ncclSuccess) { - WARN("failed to allocate %lu byte shm buffer", size); - *ptr = NULL; - return res; - } - - if(cudaHostRegister(*ptr, size, cudaHostRegisterMapped) != cudaSuccess) { - WARN("failed to register host buffer"); - shmUnlink(shmname); - shmUnmap(*ptr, size); - *ptr = NULL; - return ncclUnhandledCudaError; - } - return ncclSuccess; -} - -static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm) { - char busId[13]; - nvmlDevice_t nvmlHandle; - cudaError_t res = cudaDeviceGetPCIBusId(busId, 13, comm->cudaDev); - if (res == cudaErrorInvalidDevice) { - WARN("rank %d attempted to access an invalid cuda device %d", rank, comm->cudaDev); - return ncclInvalidDeviceIndex; - } else if (res != cudaSuccess) { - WARN("rank %d failed to get PCI Bus Id for device %d", rank, comm->cudaDev); - return ncclUnhandledCudaError; - } - INFO("rank %d using device %d (%s)", rank, comm->cudaDev, busId); - - if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) { - WARN("rank %d failed to get nvml handle for device %s", rank, busId); - return ncclUnhandledCudaError; - } - // Order by nvml index - if (wrapNvmlDeviceGetIndex(nvmlHandle, (unsigned*)&info->sortId) != ncclSuccess) { - WARN("rank %d failed to get nvml device index for device %d", rank, comm->cudaDev); - return ncclUnhandledCudaError; - } - - info->rank = rank; - info->ndev = comm->nRanks; - info->cudaDev = comm->cudaDev; - info->pid = getpid(); - info->buffSize = comm->buffSize; - info->hostptr = comm->hostMem; - info->devptr = comm->devMem; - if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) { - WARN("rank %d failed to open CUDA IPC handle", rank); - return ncclUnhandledCudaError; - } - - return ncclSuccess; -} - - -static 
ncclResult_t commClearMaps(ncclComm_t comm) { - ncclResult_t res, retval = ncclSuccess; - cudaError_t cures; - - for(int d=0; dnRanks; ++d) { - if (comm->ptrs[d].hostCleanup != NULL) { - cures = cudaHostUnregister(comm->ptrs[d].hostCleanup); - if (cures != cudaSuccess) { - WARN("rank %d failed to unregister handle to device %d", - comm->rank, d); - retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval; - } - res = shmUnmap(comm->ptrs[d].hostCleanup, offsetof(ncclMem, buff) + comm->buffSize); - if (res != ncclSuccess) { - WARN("rank %d failed to unmap handle to device %d", - comm->rank, d); - retval = (retval == ncclSuccess) ? res : retval; - } - comm->ptrs[d].hostCleanup = NULL; - } - - if (comm->ptrs[d].devCleanup != NULL) { - cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].devCleanup); - if (cures != cudaSuccess) { - WARN("rank %d failed to close IPC handle to device %d: %s", - comm->rank, d, cudaGetErrorString(cures)); - retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval; - } - } - } - - if (comm->userFromRing != NULL) - memset(comm->userFromRing, 0, sizeof(int)*comm->nRanks); - if (comm->ncclFromRing != NULL) - memset(comm->ncclFromRing, 0, sizeof(int)*comm->nRanks); - - if (comm->devUserFromRing != NULL) { - cures = cudaMemset(comm->devUserFromRing, 0, sizeof(int)*comm->nRanks); - if (cures != cudaSuccess) { - WARN("Faild to clear dev map: %s", cudaGetErrorString(cures)); - retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval; - } - } - - if (comm->devRing != NULL) { - cures = cudaMemset(comm->devRing, 0, sizeof(DevRing)); - if (cures != cudaSuccess) { - WARN("Failed to clear devRing: %s", cudaGetErrorString(cures)); - retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval; - } - } - return retval; -} - -static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* globalMemSpaceBroke) { - int ndev = comm->nRanks; - comm->rank = rank; - - if (ndev > MAXRANKS) { - WARN("%d ranks exceeds MAXRANKS of %d", ndev, MAXRANKS); - return ncclUnsupportedDeviceCount; - } - - // Check for inconsistencies between ranks - // If two ranks use the same rank, then one slot of - // ranks[] will be left unset with zero ndev/buffSize. 
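commClearMaps above uses a small but easy-to-miss idiom: every cleanup step runs even after a failure, and only the first error is remembered. A standalone sketch of that pattern (the step functions are hypothetical stand-ins for the unregister/unmap/close calls):

    #include <cstdio>

    enum Result { Ok = 0, Failed = 1 };

    Result stepA() { return Ok; }
    Result stepB() { return Failed; } // pretend one cleanup step fails
    Result stepC() { return Ok; }

    int main() {
      Result retval = Ok;
      Result res;
      // Every step runs regardless of earlier failures; only the first
      // failure is kept, mirroring the retval handling in the diff.
      res = stepA(); retval = (retval == Ok) ? res : retval;
      res = stepB(); retval = (retval == Ok) ? res : retval;
      res = stepC(); retval = (retval == Ok) ? res : retval;
      printf("final result: %d\n", retval);
      return 0;
    }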
- for(int i=0; ibuffSize - || ranks[i].ndev != comm->nRanks) { - commClearMaps(comm); - return ncclRankMismatch; - } - } - - // Find self among ranks of gather - int myNcclId = -1; - for (int i=0; iuserFromRing[ringPos] = userRank; - comm->ncclFromRing[ringPos] = ncclPos; - } - - int myDev = ranks[myNcclId].cudaDev; - pid_t myPid = ranks[myNcclId].pid; - - for (int i=0; iptrs[i].type = NodeRef::HOST; // Assume host buffer - comm->ptrs[i].devCleanup = NULL; - comm->ptrs[i].hostCleanup = NULL; - - if (iPid == myPid) { - remoteHostBuff = ranks[i].hostptr; - - if (myDev == iDev) { // shared device - INFO("rank access %d -> %d via common device", rank, iRank); - comm->ptrs[i].type = NodeRef::DEVICE; - comm->ptrs[i].local = ranks[myNcclId].devptr; - comm->ptrs[i].remote = ranks[i].devptr; - } else if (canpeer) { - INFO("rank access %d -> %d via P2P device mem", rank, iRank); - err = cudaDeviceEnablePeerAccess(iDev, 0); - if (err == cudaErrorPeerAccessAlreadyEnabled) { - cudaGetLastError(); - } else if (err != cudaSuccess) { - WARN("rank %d failed to peer with device %d: %s", - rank, iDev, cudaGetErrorString(err)); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - comm->ptrs[i].type = NodeRef::DEVICE; - comm->ptrs[i].local = ranks[myNcclId].devptr; - comm->ptrs[i].remote = ranks[i].devptr; - } - } else { // Separate processes - *globalMemSpaceBroke = 1; - char rankname[1024]; - sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank); - if (openHostMemShm(rankname, &remoteHostBuff, ranks[i].buffSize) - != ncclSuccess) { - WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - comm->ptrs[i].hostCleanup = remoteHostBuff; - - // TODO: Extend to same device (MPS) case. - // At present that would go through host mem. 
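The P2P branch that follows tolerates cudaErrorPeerAccessAlreadyEnabled because a peering may have been established by an earlier communicator on the same device pair. A standalone sketch of that tolerant setup (device indices are illustrative):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Enable access from the current device to peerDev, treating
    // "already enabled" as success, as the deleted code does.
    cudaError_t enablePeerTolerant(int peerDev) {
      cudaError_t err = cudaDeviceEnablePeerAccess(peerDev, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        cudaGetLastError(); // clear the sticky error state
        return cudaSuccess;
      }
      return err;
    }

    int main() {
      int canPeer = 0;
      if (cudaDeviceCanAccessPeer(&canPeer, 0, 1) == cudaSuccess && canPeer) {
        cudaSetDevice(0);
        cudaError_t err = enablePeerTolerant(1);
        printf("peer 0->1: %s\n", cudaGetErrorString(err));
      } else {
        printf("devices 0 and 1 cannot peer\n");
      }
      return 0;
    }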
- if (canpeer) { - INFO("rank access %d -> %d via IPC device mem", rank, iRank); - comm->ptrs[i].type = NodeRef::DEVICE; - comm->ptrs[i].local = ranks[myNcclId].devptr; - err = cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote), - ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess); - if (err != cudaSuccess) { - WARN("rank %d failed to open Ipc handle to rank %d: %s", - rank, iRank, cudaGetErrorString(err)); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - comm->ptrs[i].devCleanup = comm->ptrs[i].remote; - } - } - - err = cudaHostGetDevicePointer(&comm->ptrs[i].opCounter, - &(remoteHostBuff->opCounter), 0); - if (err != cudaSuccess) { - WARN("rank %d failed to obtain %d's zero copy pointer: %s", - rank, iRank, cudaGetErrorString(err)); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - - if (comm->ptrs[i].type == NodeRef::HOST) { - *globalMemSpaceBroke = 1; - INFO("rank access %d -> %d via zero-copy host mem", rank, iRank); - if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myNcclId].hostptr, 0) != cudaSuccess) { - WARN("rank %d failed to map zero copy buffer to device", rank); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, remoteHostBuff, 0) != cudaSuccess) { - WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - } - } - - // Setup device-side ring view - if (cudaMemcpy(comm->devUserFromRing, comm->userFromRing, ndev*sizeof(int), - cudaMemcpyHostToDevice) != cudaSuccess) { - WARN("rank %d failed to copy maps to device", rank); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - - DevRing ringTemp; - memcpy(ringTemp.userRank, comm->userFromRing, ndev*sizeof(int)); - - int prevIdx = comm->ncclFromRing[comm->nRanks-1]; - int nextIdx = comm->ncclFromRing[1 % comm->nRanks]; - NodeRef* prevPtrs = comm->ptrs+prevIdx; - NodeRef* nextPtrs = comm->ptrs+nextIdx; - - ringTemp.prevOpCounter = prevPtrs->opCounter; - ringTemp.nextOpCounter = nextPtrs->opCounter; - ringTemp.sendFlagToNext = nextPtrs->remote->flags; - ringTemp.recvFlagFromPrev = prevPtrs->local->flags; - ringTemp.sendFlagToPrev = prevPtrs->remote->flags+1; - ringTemp.recvFlagFromNext = nextPtrs->local->flags+1; - - ringTemp.recvPtrFromNext = (char**)&nextPtrs->local->recvPtrs; - ringTemp.sendPtrToPrev = (char**)&prevPtrs->remote->recvPtrs; - - ringTemp.recvBuffer = prevPtrs->local->buff; - ringTemp.sendBuffer = nextPtrs->remote->buff; - - if (cudaMemcpy(comm->devRing, &ringTemp, sizeof(ringTemp), - cudaMemcpyHostToDevice) != cudaSuccess) { - WARN("rank %d failed to copy ring maps to device", rank); - commClearMaps(comm); - return ncclUnhandledCudaError; - } - - return ncclSuccess; -} - -static void initDebug() { - const char* nccl_debug = getenv("NCCL_DEBUG"); - if (nccl_debug == NULL) { - ncclDebugLevel = NONE; - } else if (strcmp(nccl_debug, "VERSION") == 0) { - ncclDebugLevel = VERSION; - } else if (strcmp(nccl_debug, "WARN") == 0) { - ncclDebugLevel = WARN; - } else if (strcmp(nccl_debug, "INFO") == 0) { - ncclDebugLevel = INFO; - INFO("NCCL debug level set to INFO"); - } else if (strcmp(nccl_debug, "ABORT") == 0) { - ncclDebugLevel = ABORT; - INFO("NCCL debug level set to ABORT"); - } -} - -static void commFree(ncclComm_t comm) { - if (comm == NULL) - return; - - if (comm->doneEvent != NULL) - if (cudaEventDestroy(comm->doneEvent) != cudaSuccess) - INFO("ncclComm failed to destroy doneEvent"); - - ncclResult_t res = commClearMaps(comm); - if 
(res != ncclSuccess) - INFO("failed to cleanup comm maps"); - - if (comm->devRing != NULL) - if (cudaFree(comm->devRing) != cudaSuccess) - INFO("commFree failed to free devRing"); - - if (comm->userFromRing != NULL) - free(comm->userFromRing); - - if (comm->devUserFromRing != NULL) - if (cudaFree(comm->devUserFromRing) != cudaSuccess) - INFO("commFree failed to free dev maps"); - - if (comm->ncclFromRing != NULL) - free(comm->ncclFromRing); - - if (comm->devMem != NULL && cudaFree(comm->devMem) != cudaSuccess) - INFO("Failed to free devMap"); - - if (comm->hostMem != NULL) { - if (comm->hostMemState & ShmMapped) { - if (cudaHostUnregister(comm->hostMem) != cudaSuccess) - INFO("Failed to unregister hostMem"); - size_t size = offsetof(ncclMem, buff) + comm->buffSize; - if (shmUnmap(comm->hostMem, size) != ncclSuccess) - INFO("Failed to unmap hostMem"); - comm->hostMemState ^= ShmMapped; - } else { - cudaFreeHost(comm->hostMem); - } - } - free(comm); -} - -static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, const ncclUniqueId* commId, int rank) { - size_t commBytes = offsetof(ncclComm, ptrs) + ndev*sizeof(NodeRef); - struct ncclComm* comm = (struct ncclComm*)malloc(commBytes); - if (comm == NULL) { - WARN("comm allocation failed"); - return ncclSystemError; - } - memset(comm, 0, commBytes); - - comm->nRanks = ndev; - cudaGetDevice(&comm->cudaDev); - - const char* str = getenv("NCCL_BUFFSIZE"); - int buffsize; - if (str != NULL) { - errno = 0; - buffsize = strtol(str, NULL, 10); - if (errno == ERANGE || buffsize == 0) { - INFO("rank %d invalid NCCL_BUFFSIZE: %s, using default %lu", - rank, str, DEFAULT_BUFFER_SIZE_BYTES); - buffsize = DEFAULT_BUFFER_SIZE_BYTES; - } - } else { - buffsize = DEFAULT_BUFFER_SIZE_BYTES; - } - comm->buffSize = buffsize; - INFO("rank %d using buffSize = %lu", rank, comm->buffSize); - - - ncclResult_t res; - res = allocDevMem(&comm->devMem, comm->buffSize); - if (res != ncclSuccess) { - WARN("rank %d failed to allocate device buffer", rank); - commFree(comm); - return res; - } - - if (cudaMalloc(&comm->devRing, sizeof(DevRing)) != cudaSuccess) { - WARN("rank %d failed to allocate device-side ring views", rank); - commFree(comm); - return ncclCudaMallocFailed; - } - - if (cudaMalloc(&comm->devUserFromRing, ndev*sizeof(int)) != cudaSuccess ) { - WARN("rank %d failed to allocated device maps", rank); - commFree(comm); - return ncclCudaMallocFailed; - } - - comm->userFromRing = (int*)malloc(ndev*sizeof(int)); - if (comm->userFromRing == NULL) { - WARN("rank %d failed to allocate host maps", rank); - commFree(comm); - return ncclSystemError; - } - - comm->ncclFromRing = (int*)malloc(ndev*sizeof(int)); - if (comm->ncclFromRing == NULL) { - WARN("rank %d failed to allocate host maps", rank); - commFree(comm); - return ncclSystemError; - } - - if (cudaEventCreateWithFlags(&comm->doneEvent, cudaEventDisableTiming) != cudaSuccess) { - WARN("ncclComm on rank %d failed to create doneEvent", rank); - commFree(comm); - return ncclUnhandledCudaError; - } - - if(commId == NULL) { - comm->hostMemState = 0; - res = allocHostMem(&comm->hostMem, comm->buffSize); - } else { - char rankname[1024]; - sprintf(rankname, "%s-%d", commId->internal, rank); - res = openHostMemShm(rankname, &comm->hostMem, comm->buffSize); - if (res != ncclSuccess) { - WARN("rank %d failed to allocate host buffer", rank); - commFree(comm); - return res; - } - comm->hostMemState = ShmMapped | ShmLinked; - } - - if (cudaHostGetDevicePointer(&comm->opCounter, &comm->hostMem->opCounter, 0) != cudaSuccess) { - 
WARN("ncclComm on rank %d failed to map opCounter to device", rank); - commFree(comm); - return ncclUnhandledCudaError; - } - - *comret = comm; - return ncclSuccess; -} - -static ncclResult_t devCommUpdate(ncclComm_t comm) { - // Copy the comm on the device - size_t commBytes = offsetof(ncclComm, ptrs) + comm->nRanks*sizeof(NodeRef); - if (cudaMemcpy(comm->devComm, comm, commBytes, cudaMemcpyHostToDevice) != cudaSuccess) { - WARN("failed to copy device comm"); - return ncclUnhandledCudaError; - } - // Fix the host pointer to be accessible from the device - void* dptr; - if (cudaHostGetDevicePointer(&dptr, comm->hostMem, 0) != cudaSuccess) { - WARN("failed to get device pointer for host mem"); - return ncclUnhandledCudaError; - } - if (cudaMemcpy(&comm->devComm->hostMem, &dptr, sizeof(dptr), cudaMemcpyHostToDevice) != cudaSuccess) { - WARN("failed to update host pointer"); - return ncclUnhandledCudaError; - } - return ncclSuccess; -} - -static ncclResult_t devCommSetup(ncclComm_t comm) { - // Fully duplicate the comm on the device - size_t commBytes = offsetof(ncclComm, ptrs) + comm->nRanks*sizeof(NodeRef); - if (cudaMalloc(&comm->devComm, commBytes) != cudaSuccess) { - WARN("failed to allocated device comm"); - return ncclCudaMallocFailed; - } - return devCommUpdate(comm); -} - -static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int rank) { - char rankname[1024]; - sprintf(rankname, "%s-%d", commId.internal, rank); - if (comm->hostMemState & ShmLinked) - comm->hostMemState ^= ShmLinked; - return shmUnlink(rankname); -} - -static void showVersion() { - static int shown = 0; - if (shown == 0 && ncclDebugLevel >= VERSION) { - printf("NCCL version %d.%d.%d compiled with CUDA %d.%d\n", NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, CUDA_MAJOR, CUDA_MINOR); - fflush(stdout); - shown = 1; - } -} - -NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank); -ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) { - if (myrank == 0) showVersion(); - - NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm")); - - if (ndev < 1) { - WARN("Invalid device count requested : %d", ndev); - return ncclUnsupportedDeviceCount; - } - if (myrank >= ndev || myrank < 0) { - WARN("Invalid rank %d, should be in the range 0..%d", myrank, ndev-1); - return ncclInvalidRank; - } - - if (strlen(commId.internal) < 1 || - strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) { - WARN("rank %d invalid commId", myrank); - return ncclInvalidArgument; - } - - initDebug(); - ncclResult_t res; - RankEntry myStuff; - RankGather* gath = NULL; - - res = wrapSymbols(); - if (res != ncclSuccess) { - WARN("NCCL failed to initialize client libs"); - return res; - } - - res = wrapNvmlInit(); - if (res != ncclSuccess) { - WARN("rank %d failed to initialize nvml", myrank); - return res; - } - - res = commAlloc(newcomm, ndev, &commId, myrank); - if (res != ncclSuccess) { - WARN("rank %d failed to allocate communicator", myrank); - return res; - } - - res = populateRankInfo(&myStuff, myrank, *newcomm); - if (res != ncclSuccess) { - WARN("rank %d failed to obtain rank info", myrank); - goto cleanup; - } - - res = initGather(&gath, commId, ndev, myrank, myStuff); - if (res != ncclSuccess) { - WARN("rank %d failed to gather rank info", myrank); - goto cleanup; - } - - res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->globalMemSpaceBroke); - syncRingDirect(gath, &((*newcomm)->globalMemSpace)); - if (res != ncclSuccess) { - WARN("rank 
%d failed to build comm maps", myrank); - goto cleanup; - } - - INFO("Global device memory space is %s", (*newcomm)->globalMemSpace ? "enabled" : "disabled"); - - res = closeGather(gath, ndev); // includes a barrier - gath = NULL; - if (res != ncclSuccess) { - WARN("rank %d failed to close gather", myrank); - goto cleanup; - } - - res = devCommSetup(*newcomm); - if (res != ncclSuccess) { - WARN("rank %d failed to copy dcomm", myrank); - goto cleanup; - } - - res = ncclSuccess; - goto final; - - cleanup: - if (gath != NULL) - closeGather(gath, ndev); - commFree(*newcomm); - - final: - if ((*newcomm)->hostMemState & ShmLinked) { - if (commUnlinkHostMem(*newcomm, commId, myrank) != ncclSuccess) - INFO("rank %d failed to unlink host mem shm segment", myrank); - } - - if (wrapNvmlShutdown() != ncclSuccess) - INFO("rank %d did not shutdown nvml properly", myrank); - return res; -} - -NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); -ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { - initDebug(); - - showVersion(); - - NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); - - if (ndev < 1) { - WARN("Invalid device count requested : %d", ndev); - return ncclUnsupportedDeviceCount; - } - - ncclResult_t res; - int savedDevice; - RankEntry* ranks = NULL; - int rank, cudaDev; - ncclComm_t comm = NULL; - char busId[13]; - nvmlDevice_t nvmlHandle; - int affinity_set = 0; - int globalMemSpaceBroke = 0; // Assume direct access to recv ptr OK - - res = wrapSymbols(); - if (res != ncclSuccess) { - WARN("NCCL failed to initialize client libs"); - return res; - } - - cudaGetDevice(&savedDevice); - ranks = (RankEntry*)malloc(ndev*sizeof(RankEntry)); - if (ranks == NULL) { - WARN("NCCL allocation failed"); - return ncclSystemError; - } - memset(ranks, 0, ndev*sizeof(RankEntry)); - - res = wrapNvmlInit(); - if (res != ncclSuccess) { - WARN("nccl failed to initialize nvml"); - return res; - } - - for(rank=0; rankcudaDev); - res = commBuildMaps(comm, NULL, rank, ranks, &globalMemSpaceBroke); - if (res != ncclSuccess) { - WARN("rank %d failed to build comm maps", rank); - goto cleanup; - } - } - - INFO("Global device memory space is %s", (globalMemSpaceBroke) ? "disabled" : "enabled"); - for(rank=0; rankglobalMemSpace = globalMemSpaceBroke ? 
0 : 1; - } - - for(rank=0; rankcudaDev; - - if (savedDevice != commDevice) { - CUDACHECK(cudaSetDevice(commDevice), void()); - } - - commFree(comm); - - if (savedDevice != commDevice) - cudaSetDevice(savedDevice); -} - -NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); -const char* ncclGetErrorString(ncclResult_t code) { - switch (code) { - case ncclSuccess : return "no error"; - case ncclUnhandledCudaError : return "unhandled cuda error"; - case ncclSystemError : return "system error"; - case ncclInternalError : return "internal error"; - case ncclInvalidDevicePointer : return "invalid device pointer"; - case ncclInvalidRank : return "invalid rank"; - case ncclUnsupportedDeviceCount : return "unsupported device count"; - case ncclDeviceNotFound : return "device not found"; - case ncclInvalidDeviceIndex : return "invalid device index"; - case ncclLibWrapperNotSet : return "lib wrapper not initialized"; - case ncclCudaMallocFailed : return "cuda malloc failed"; - case ncclRankMismatch : return "parameter mismatch between ranks"; - case ncclInvalidArgument : return "invalid argument"; - case ncclInvalidType : return "invalid data type"; - case ncclInvalidOperation : return "invalid reduction operations"; - } - return "unknown result code"; -} - -NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { - NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); - NCCLCHECK(PtrCheck(count, "CommCount", "count")); - *count = comm->nRanks; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { - NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); - NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); - *devid = comm->cudaDev; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { - NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); - NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); - *rank = comm->rank; - return ncclSuccess; -} - diff --git a/third_party/nccl/src/core.h b/third_party/nccl/src/core.h deleted file mode 100644 index 17794d7873bb63..00000000000000 --- a/third_party/nccl/src/core.h +++ /dev/null @@ -1,162 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef CORE_H_ -#define CORE_H_ - - -#include "nccl.h" -#include -#include - -#define MAXRANKS 32 -#define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21) -#define NCCL_MEM_PAD_ALIGN 65536 - - -struct ncclMem { - union { // Pad this block so that devBuff is correctly aligned - struct { - int flags[2]; - void* recvPtrs; - int opCounter; // Used to determine when remote Communicators are ready. - // Only used in host memory. - }; - char pad[NCCL_MEM_PAD_ALIGN]; - }; - // devBuff will be bigger ; we only use its offset/address. 
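ncclMem below is allocated as offsetof(ncclMem, buff) + buffSize, so the one-element buff member really addresses a variable-size buffer that starts at a fixed, padded offset. A host-only sketch of the same pattern (names and sizes are illustrative):

    #include <cstdio>
    #include <cstdlib>
    #include <cstddef>

    #define PAD_ALIGN 65536

    struct PaddedMem {
      union {            // pad the header so buff starts PAD_ALIGN in
        struct {
          int flags[2];
          int opCounter;
        };
        char pad[PAD_ALIGN];
      };
      char buff[1];      // actual buffer extends past the struct
    };

    int main() {
      size_t buffSize = 1 << 21;
      size_t total = offsetof(PaddedMem, buff) + buffSize;
      PaddedMem* m = (PaddedMem*)malloc(total);
      if (!m) return 1;
      printf("header=%zu bytes, usable buffer=%zu bytes\n",
             offsetof(PaddedMem, buff), buffSize);
      free(m);
      return 0;
    }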
- char buff[1]; -}; - -template -struct alignas(long long) DevRing { - volatile int* __restrict__ prevOpCounter; - volatile int* __restrict__ nextOpCounter; - volatile int* __restrict__ sendFlagToNext; - volatile int* __restrict__ sendFlagToPrev; - volatile int* __restrict__ recvFlagFromNext; - volatile int* __restrict__ recvFlagFromPrev; - - T* volatile * __restrict__ recvPtrFromNext; - T* volatile * __restrict__ sendPtrToPrev; - T* __restrict__ recvBuffer; - T* __restrict__ sendBuffer; - - int userRank[MAXRANKS]; -}; - -struct NodeRef { - ncclMem* remote; // TODO: Verify if these - ncclMem* local; // are still needed. - enum {DEVICE, HOST} type; - ncclMem* devCleanup; // Used only when remote comm uses same process & GPU - ncclMem* hostCleanup; // Used whenever target is in different process - int* opCounter; // TODO: see if this can be removed too. -}; - - -struct ncclComm { - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - - // Device and Host allocated chunks. Stored here to correctly free() memory. - ncclMem* devMem; - ncclMem* hostMem; - int hostMemState; - int opSched; // Scheduling operation index - int* opCounter; // Counter of completed operations - - cudaStream_t prevStream; // cache last used stream - cudaEvent_t doneEvent; // orders operations in different streams - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userFromRing; - - // copy of the above stored on each device - int* devUserFromRing; - - // Ring order - int* ncclFromRing; // TODO: REMOVE IF NOT NEEDED BEYOND CORE.CU - - // Size of temp buffer in bytes. - size_t buffSize; - - // Whether we have remote access to the recvbuff pointers passed from remote - // GPUs. In single process mode this can be used as long as QPI links are - // not present. In multi-process, we never push to a remote recvbuff. - int globalMemSpace; - - // Device copy of the communicator - struct ncclComm *devComm; // TODO: Remove this if not useful - - // Device-side ring view - DevRing* devRing; - - // Device-to-device communication structures to access remote or local device - // memory. Actual allocation larger than 1. - NodeRef ptrs[1]; -}; - - -typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel; -extern DebugLevel ncclDebugLevel; - -#define WARN(...) do { \ - if (ncclDebugLevel >= WARN) { \ - printf("WARN %s:%d ", __FILE__, __LINE__); \ - printf(__VA_ARGS__); \ - printf("\n"); \ - fflush(stdout); \ - if (ncclDebugLevel >= ABORT) abort(); \ - } \ -} while(0) - -#define INFO(...) do { \ - if (ncclDebugLevel >= INFO) { \ - printf("INFO "); printf(__VA_ARGS__); printf("\n"); \ - fflush(stdout); \ - } \ -} while(0) - -// Check CUDA calls -#define CUDACHECK(cmd, retcode) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'\n", cudaGetErrorString(e)); \ - return retcode; \ - } \ -} while(false) - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess) { \ - return res; \ - } \ -} while (0); - -#ifdef PROFAPI -#define NCCL_API(ret, func, args...) \ - __attribute__ ((visibility("default"))) \ - __attribute__ ((alias(#func))) \ - ret p##func (args); \ - extern "C" \ - __attribute__ ((visibility("default"))) \ - __attribute__ ((weak)) \ - ret func(args) -#else -#define NCCL_API(ret, func, args...) 
\ - extern "C" \ - __attribute__ ((visibility("default"))) \ - ret func(args) -#endif // end PROFAPI - - -#endif // end include guard - diff --git a/third_party/nccl/src/enqueue.h b/third_party/nccl/src/enqueue.h deleted file mode 100644 index 43d570efee3a04..00000000000000 --- a/third_party/nccl/src/enqueue.h +++ /dev/null @@ -1,112 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef enqueue_h_ -#define enqueue_h_ - -#include "core.h" -#include "reduce_kernel.h" - -/* Syncronize previous collective (if in different stream) and enqueue - * collective. Work is performed asynchronously with the host thread. - * The ColFunc class should be templated on the datatype and reduction - * operator (if applicable) and define a static entry() method as - * follows. - * template class RedOp> - * class CollectiveFunctor { - * public: - * static ncclResult_t entry(const void* sendbuff, void* recvbuff, int count, - * int root, ncclComm* comm, cudaStream_t stream); - * }; - * The entry() method can assume that the appropriate cuda device has been set. */ -template< template class> class ColFunc, - typename T, - template class Op > -ncclResult_t enqueue(const void* sendbuff, - void* recvbuff, - int count, - int root, - ncclComm_t comm, - cudaStream_t stream) -{ - if (stream != comm->prevStream) { // sync required for calls in different streams - comm->prevStream = stream; - CUDACHECK(cudaStreamWaitEvent(stream, comm->doneEvent, 0), ncclUnhandledCudaError); - } - - ncclResult_t ret; - ret = ColFunc::entry(sendbuff, recvbuff, count, root, comm, stream); - - // Always have to record done event because we don't know what stream next - // collective will be in. 
- CUDACHECK(cudaEventRecord(comm->doneEvent, stream), ncclUnhandledCudaError); - comm->opSched += 1; - return ret; -} - - -// This version decodes type -template< template class> class ColFunc, - template class Op > -ncclResult_t enqueue(const void* sendbuff, - void* recvbuff, - int count, - ncclDataType_t type, - int root, - ncclComm_t comm, - cudaStream_t stream) -{ - switch(type) { - case ncclChar: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); - case ncclInt: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); -#ifdef CUDA_HAS_HALF - case ncclHalf: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); -#endif - case ncclFloat: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); - case ncclDouble: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); - case ncclInt64: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); - case ncclUint64: - return enqueue(sendbuff, recvbuff, count, root, comm, stream); - default: - WARN("Invalid ncclType %d", type); - return ncclInvalidType; - } -} - -// This version decodes both type and reduction op -template< template class> class ColFunc> -ncclResult_t enqueue(const void* sendbuff, - void* recvbuff, - int count, - ncclDataType_t type, - ncclRedOp_t op, - int root, - ncclComm_t comm, - cudaStream_t stream) -{ - switch(op) { - case ncclSum: - return enqueue(sendbuff, recvbuff, count, type, root, comm, stream); - case ncclProd: - return enqueue(sendbuff, recvbuff, count, type, root, comm, stream); - case ncclMax: - return enqueue(sendbuff, recvbuff, count, type, root, comm, stream); - case ncclMin: - return enqueue(sendbuff, recvbuff, count, type, root, comm, stream); - default: - WARN("Invalid ncclRedOp: %d", op); - return ncclInvalidOperation; - } -} - -#endif // End include guard - diff --git a/third_party/nccl/src/libwrap.cu b/third_party/nccl/src/libwrap.cu deleted file mode 100644 index 1ac19a62384c2d..00000000000000 --- a/third_party/nccl/src/libwrap.cu +++ /dev/null @@ -1,155 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "libwrap.h" -#include -#include "core.h" - -int symbolsLoaded = 0; - -static nvmlReturn_t (*nvmlInternalInit)(void); -static nvmlReturn_t (*nvmlInternalShutdown)(void); -static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device); -static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); -static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device); -static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device); -static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); - -ncclResult_t wrapSymbols(void) { - - if (symbolsLoaded) - return ncclSuccess; - - static void* nvmlhandle = NULL; - void* tmp; - void** cast; - - nvmlhandle=dlopen("libnvidia-ml.so", RTLD_NOW); - if (!nvmlhandle) { - nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW); - if (!nvmlhandle) { - WARN("Failed to open libnvidia-ml.so[.1]"); - goto teardown; - } - } - - #define LOAD_SYM(handle, symbol, funcptr) do { \ - cast = (void**)&funcptr; \ - tmp = dlsym(handle, symbol); \ - if (tmp == NULL) { \ - WARN("dlsym failed on %s - %s", symbol, dlerror());\ - goto teardown; \ - } \ - *cast = tmp; \ - } while (0) - - LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit); - LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); - LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity); - LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity); - LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); - - symbolsLoaded = 1; - return ncclSuccess; - - teardown: - nvmlInternalInit = NULL; - nvmlInternalShutdown = NULL; - nvmlInternalDeviceGetHandleByPciBusId = NULL; - nvmlInternalDeviceGetIndex = NULL; - nvmlInternalDeviceSetCpuAffinity = NULL; - nvmlInternalDeviceClearCpuAffinity = NULL; - - if (nvmlhandle != NULL) dlclose(nvmlhandle); - return ncclSystemError; -} - - -ncclResult_t wrapNvmlInit(void) { - if (nvmlInternalInit == NULL) { - WARN("lib wrapper not initialized."); - return ncclLibWrapperNotSet; - } - nvmlReturn_t ret = nvmlInternalInit(); - if (ret != NVML_SUCCESS) { - WARN("nvmlInit() failed: %s", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlShutdown(void) { - if (nvmlInternalShutdown == NULL) { - WARN("lib wrapper not initialized."); - return ncclLibWrapperNotSet; - } - nvmlReturn_t ret = nvmlInternalShutdown(); - if (ret != NVML_SUCCESS) { - WARN("nvmlShutdown() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - if (nvmlInternalDeviceGetHandleByPciBusId == NULL) { - WARN("lib wrapper not initialized."); - return ncclLibWrapperNotSet; - } - nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { - if (nvmlInternalDeviceGetIndex == NULL) { - WARN("lib wrapper not 
initialized."); - return ncclLibWrapperNotSet; - } - nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetIndex() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - if (nvmlInternalDeviceSetCpuAffinity == NULL) { - WARN("lib wrapper not initialized."); - return ncclLibWrapperNotSet; - } - nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceSetCpuAffinity() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - if (nvmlInternalInit == NULL) { - WARN("lib wrapper not initialized."); - return ncclLibWrapperNotSet; - } - nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceClearCpuAffinity() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} diff --git a/third_party/nccl/src/libwrap.h b/third_party/nccl/src/libwrap.h deleted file mode 100644 index cdce480415467a..00000000000000 --- a/third_party/nccl/src/libwrap.h +++ /dev/null @@ -1,54 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - - -// Dynamically handle dependencies on external libraries (other than cudart). - -#ifndef SRC_LIBWRAP_H_ -#define SRC_LIBWRAP_H_ - -#include "core.h" - -/* Extracted from nvml.h */ -typedef struct nvmlDevice_st* nvmlDevice_t; - -typedef enum nvmlReturn_enum -{ - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed 
because the GPU is currently in use - NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred -} nvmlReturn_t; -/* End of nvml.h */ - -ncclResult_t wrapSymbols(void); - -ncclResult_t wrapNvmlInit(void); -ncclResult_t wrapNvmlShutdown(void); -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); -ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device); -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device); - -#endif // End include guard - diff --git a/third_party/nccl/src/nccl.h b/third_party/nccl/src/nccl.h deleted file mode 100644 index 7bb5aa52bc0642..00000000000000 --- a/third_party/nccl/src/nccl.h +++ /dev/null @@ -1,203 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_H_ -#define NCCL_H_ - -#include - -#if CUDART_VERSION >= 7050 -#include -#define CUDA_HAS_HALF 1 -#else -#undef CUDA_HAS_HALF -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* Opaque handle to communicator */ -typedef struct ncclComm* ncclComm_t; - -#define NCCL_UNIQUE_ID_BYTES 128 -typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; - -/* Error type */ -typedef enum { ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidDevicePointer = 4, - ncclInvalidRank = 5, - ncclUnsupportedDeviceCount = 6, - ncclDeviceNotFound = 7, - ncclInvalidDeviceIndex = 8, - ncclLibWrapperNotSet = 9, - ncclCudaMallocFailed = 10, - ncclRankMismatch = 11, - ncclInvalidArgument = 12, - ncclInvalidType = 13, - ncclInvalidOperation = 14, - nccl_NUM_RESULTS = 15 } ncclResult_t; - -/* Generates a unique Id with each call. Used to generate commId for - * ncclCommInitAll. uniqueId will be created in such a way that it is - * guaranteed to be unique accross the host. */ -ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); -ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); - -/* Creates a new communicator (multi process version). - * rank must be between 0 and ndev-1 and unique within a communicator clique. - * ndev is number of logical devices - * The communicator is created on the current CUDA device. - * ncclCommInitRank implicitly syncronizes with other ranks, so INIT OF EACH RANK MUST - * BE CALLED IN A SEPARATE HOST THREADS to avoid deadlock. */ -ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank); -ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank); - -/* Creates a clique of communicators. - * This is a convenience function to create a single-process communicator clique. - * Returns an array of ndev newly initialized communicators in comm. - * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). - * If devlist is NULL, the first ndev CUDA devices are used. - * Order of devlist defines user-order of processors within the communicator. */ -ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - -/* Frees resources associated with communicator object. */ -void ncclCommDestroy(ncclComm_t comm); -void pncclCommDestroy(ncclComm_t comm); - -/* Returns nice error message. 
*/ -const char* ncclGetErrorString(ncclResult_t result); -const char* pncclGetErrorString(ncclResult_t result); - -/* Sets count to number of devices in the communicator clique. */ -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); -ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); - -/* Returns cuda device number associated with communicator. */ -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); -ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); - -/* Returns user-ordered "rank" assocaiated with communicator. */ -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); -ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); - -/* Reduction opperation selector */ -typedef enum { ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - nccl_NUM_OPS = 4 } ncclRedOp_t; - -/* Data types */ -typedef enum { ncclChar = 0, - ncclInt = 1, -#ifdef CUDA_HAS_HALF - ncclHalf = 2, -#endif - ncclFloat = 3, - ncclDouble = 4, - ncclInt64 = 5, - ncclUint64 = 6, - nccl_NUM_TYPES = 7 } ncclDataType_t; - -/* Reduces data arrays of length count in sendbuff into recvbuf using op operation. - * recvbuf may be NULL on all calls except for root device. - * On the root device, sendbuff and recvbuff are assumed to reside on - * the same device. - * Must be called separately for each communicator in communicator clique. -*/ -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); - -/* Reduces data arrays of length count in sendbuff using op operation, and leaves - * identical copies of result on each GPUs recvbuff. - * Sendbuff and recvbuff are assumed to reside on the same device. - * Must be called separately for each communicator in communicator clique. */ -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); - -/* Reduces data in sendbuff using op operation and leaves reduced result scattered - * over the devices so that recvbuff on the i-th GPU will contain the i-th block of - * the result. Sendbuff and recvbuff are assumed to reside on same device. Assumes - * sendbuff has size at least ndev*recvcount elements, where ndev is number of - * communicators in communicator clique - * Must be called separately for each communicator in communicator clique.*/ -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, - int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); -ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); - -/* Copies count values from root to all other devices. - * Root specifies the source device in user-order - * (see ncclCommInit). - * Must be called separately for each communicator in communicator clique. 
*/ -ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); - - -/* Each device gathers count values from other GPUs. - * Result is ordered by comm's logical device order. - * Assumes recvbuff has size at least ndev*count, where ndev is number of communicators - * in communicator clique. - * Sendbuff and recvbuff are assumed to reside on same device. - * Must be called separately for each communicator in communicator clique. */ -ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, - void* recvbuff, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, - void* recvbuff, ncclComm_t comm, cudaStream_t stream); - - -/* The following collective operations are not implemented yet */ -///* Gather count values from each device to recvbuff. -// * Result is ordered by comm's logical device order. -// * recvbuff may be NULL for all calls except for root device. -// * On the root device, sendbuff and recvbuff are assumed to reside on the same device. -// * Must be called separately for each communicator in communicator clique. */ -// * All GPUs, including root, perform copies into recvbuff. -//ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype, -// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream); -//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype, -// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream); - -///* Root device scatters count values to each devices. -// * sendbuff may be NULL on all devices except a single root -// * device where it is assumed to have size at least nGPUs*count. -// * recvbuff allocated on each gpu, including root, size=count. -// * Result is ordered by comm's logical device order. -// * Called separately for each device in the ncclComm. */ -//ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff, -// int count, int root, ncclComm_t comm, cudaStream_t stream); -//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff, -// int count, int root, ncclComm_t comm, cudaStream_t stream); -// -///* All GPUs scatter blocks of count elements to other devices. -// * Must be called separately for each device in the ncclComm. -// * sendbuff and recvbuff assumed to reside on same device and -// * have size at least nGPUs*count. -// * Called separately for each device in the ncclComm. */ -//ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype, -// void* recvbuff, ncclComm_t comm, cudaStream_t stream); -//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype, -// void* recvbuff, ncclComm_t comm, cudaStream_t stream); - -#ifdef __cplusplus -} // end extern "C" -#endif - -#endif // end include guard - diff --git a/third_party/nccl/src/primitives.h b/third_party/nccl/src/primitives.h deleted file mode 100644 index bcaeca8f901edd..00000000000000 --- a/third_party/nccl/src/primitives.h +++ /dev/null @@ -1,206 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef PRIMITIVES_H_
-#define PRIMITIVES_H_
-
-#include <type_traits>
-#include "copy_kernel.h" // for FuncPassA
-#include "reduce_kernel.h" // for reduction funcs
-
-
-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the repetition of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of the
- * corresponding substep by the previous step) before executing the transfer.
- * After each substep is transferred, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
-
-
-class WaitFlag {
-  volatile int * const flag;
-  const int shift;
-  public:
-  __device__ __forceinline__
-  WaitFlag(volatile int * const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__ __forceinline__
-  void wait(int val) { while (*flag < (val + shift)) /*SPIN*/; }
-};
-
-
-class PostFlag {
-  volatile int * const flag;
-  const int shift;
-  public:
-  __device__ __forceinline__
-  PostFlag(volatile int* const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__ __forceinline__
-  void post(int val) { *flag = (val + shift); }
-};
-
-
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template <typename T> __device__ __forceinline__
-bool AnyAre() { return false; }
-
-template <typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__ __forceinline__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
-  return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
-
-
-// Wait on all WaitFlags, ignore PostFlags
-__device__ __forceinline__
-void WaitOnFlags(int val) { }
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(int val, WaitFlag flag, TAIL_Ts... tail) {
-  flag.wait(val);
-  WaitOnFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(int val, PostFlag, TAIL_Ts... tail) {
-  WaitOnFlags(val, tail...);
-}
-
-
-// Post all PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostToFlags(int val) { }
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(int val, WaitFlag flag, TAIL_Ts... tail) {
-  PostToFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(int val, PostFlag flag, TAIL_Ts... tail) {
-  flag.post(val);
-  PostToFlags(val, tail...);
-}
-
-
-// Create pointer arithmetic syntax that doesn't break for nullptr_t
-template <typename Tptr> __device__ __forceinline__
-Tptr ptradd(Tptr ptr, int i) {
-  return ptr + i;
-}
-
-__device__ __forceinline__
-std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
-  return nullptr;
-}
-
-
-// Implementation of primitive types
-template <int THREADS, int UNROLL, int SUBSTEPS, typename T,
-    typename REDOP = FuncSum<T> >
-class Primitives {
-  private:
-  template <typename SRC2_T, typename DST2_T,
-      typename... SYNC_Ts> // either WaitFunc or PostFunc
-  static __device__ __forceinline__ void
-  GenericOp(const T* src1,
-      const SRC2_T src2,
-      T* dst1,
-      DST2_T dst2,
-      int len, int maxoffset, int step, SYNC_Ts...
      flags) {
-
-    enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
-    enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
-    static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
-        "src2 must be of type T* or nullptr_t");
-    static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
-        "dst2 must be of type T* or nullptr_t");
-
-    using OpType = typename std::conditional<noSrc2, FuncPassA<T>, REDOP>::type;
-
-    if (threadIdx.x < THREADS) {
-      int sliceSize = len / SUBSTEPS;
-      int sliceOffset = 0;
-      #pragma unroll 1
-      for (int sub=0; sub<SUBSTEPS; ++sub) {
-        if (AnyAre<WaitFlag>(flags...)) {
-          if (threadIdx.x == 0) {
-            WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
-          }
-          asm volatile ("bar.sync 1, %0;" :: "r"(THREADS));
-        }
-        ReduceOrCopy
-            <
-            UNROLL,
-            THREADS,
-            OpType,
-            T,
-            !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
-            !std::is_same<SRC2_T, std::nullptr_t>::value  // HAS_SRC1
-            >
-            (
-            threadIdx.x,
-            ptradd(dst1, sliceOffset),
-            ptradd(dst2, sliceOffset),
-            ptradd(src1, sliceOffset),
-            ptradd(src2, sliceOffset),
-            min(sliceSize, maxoffset-sliceOffset)
-            );
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-        }
-        sliceOffset += sliceSize;
-      }
-    } else {
-      for(int sub=0; sub<SUBSTEPS; ++sub) {
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-          __threadfence_system();
-          PostToFlags(SUBSTEPS*step + sub + 1, flags...);
-        }
-      }
-    }
-  }
-
-  public:
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  Copy(const T* src, T* dst,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
-  }
-
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  DoubleCopy(const T* src, T* dst1, T* dst2,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
-  }
-
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  Reduce(const T* src1, const T* src2, T* dst,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src1, src2, dst, nullptr, len, maxOffset, step, flags...);
-  }
-
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  ReduceCopy(const T* src1, const T* src2, T* dst1, T* dst2,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src1, src2, dst1, dst2, len, maxOffset, step, flags...);
-  }
-};
-
-#endif // end include guard
diff --git a/third_party/nccl/src/reduce.cu b/third_party/nccl/src/reduce.cu
deleted file mode 100644
index 721518373f3429..00000000000000
--- a/third_party/nccl/src/reduce.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
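 *
 * Shape of the ring reduce implemented below (a sketch inferred from
 * ReduceKernel): for each slice, a rank plays one of three roles,
 *
 *   prevRank == root : Copy   thisInput             -> nextOutput  (ring start)
 *   rank     == root : Reduce thisInput + prevInput -> thisOutput  (ring end)
 *   otherwise        : Reduce thisInput + prevInput -> nextOutput  (middle)
 *
 * so partial sums travel hop by hop around the ring and the fully reduced
 * slice lands on root after nranks-1 hops.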
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "primitives.h" - -#define NUM_SUBSTEPS 2 -#define NUM_BUFCHUNKS 2 - -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -template -__launch_bounds__(THREADS+WARP_SIZE, 1) -__global__ void ReduceKernel(const KernelArgs args) { - const int tid = threadIdx.x; - __shared__ DevRing ring; - - LoadRing(args.ring, &ring); - __syncthreads(); - - if (tid == 0) { - WaitFlag prevCommOp(ring.prevOpCounter, 0); - WaitFlag nextCommOp(ring.nextOpCounter, 0); - prevCommOp.wait(args.opIndex); - nextCommOp.wait(args.opIndex); - } - __syncthreads(); - - WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0); - PostFlag postDoneToPrev(ring.sendFlagToPrev, 0); - PostFlag postReadyToNext(ring.sendFlagToNext, 0); - - typedef Primitives Prims; - - const int size = args.N; - const int nranks = args.nRanks; - const int rank = ring.userRank[0]; - const int prevRank = ring.userRank[nranks-1]; - const int root = args.root; - const int buffSize = args.buffSize / sizeof(T); - const int sliceSize = buffSize / NUM_BUFCHUNKS; - - int step = 0; - int boffset = 0; - - // Compute pointers - const T * __restrict__ thisInput = args.ThisInput; - T * __restrict__ thisOutput = args.ThisOutput; - T * __restrict__ prevInput = ring.recvBuffer; - T * __restrict__ nextOutput = ring.sendBuffer; - - for (int offset = 0; offset < size; offset += sliceSize) { - int maxOffset = size-offset; - if (prevRank == root) { - Prims::Copy( - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - } else if (rank == root) { - Prims::Reduce( - thisInput + offset, - prevInput + boffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - Prims::Reduce( - thisInput + offset, - prevInput + boffset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - NEXT_STEP; // Increases step, boffset - } - - // wait for the last data to be pushed to us - if (tid == 0) { - if (rank != root) { - // Wait for last update from next then reset the flag - waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1)); - *ring.recvFlagFromNext = 0; - } - - if (prevRank != root) { - // reset the flag - *ring.recvFlagFromPrev = 0; - } - - incrementOpCounter(&args); - } -} - -#define THREADS 512 -#define UNROLL 8 - -template -ncclResult_t RingReduce(const void* sendbuff, void* recvbuff, const int count, const int root, - ncclComm* comm, cudaStream_t stream) { - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError); - } else { - KernelArgs args; - ArgsSetup(&args, sendbuff, recvbuff, root, count, comm); - LAUNCH_KERNEL(ReduceKernel, THREADS, UNROLL, FUNC, T, args, stream); - } - - return ncclSuccess; -} - -template class RedOp> -class ReduceFunctor { - public: - static ncclResult_t entry(const void* sendbuff, void* recvbuff, - int count, int root, ncclComm* comm, cudaStream_t stream) { - return 
RingReduce, T>(sendbuff, recvbuff, count, root, comm, stream); - } -}; - -NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, int count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, int count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, root, comm, "Reduce")); - return enqueue(sendbuff, recvbuff, count, datatype, op, root, comm, stream); -} - diff --git a/third_party/nccl/src/reduce_kernel.h b/third_party/nccl/src/reduce_kernel.h deleted file mode 100644 index f2cd512f5ce42f..00000000000000 --- a/third_party/nccl/src/reduce_kernel.h +++ /dev/null @@ -1,309 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - - -#ifndef REDUCE_KERNEL_H_ -#define REDUCE_KERNEL_H_ - -#include "common_kernel.h" -#include - -template -struct FuncNull { - __device__ T operator()(const T x, const T y) const { - return 0; - } -}; - -template -struct FuncSum { - __device__ T operator()(const T x, const T y) const { - return x + y; - } -}; - -template -struct FuncProd { - __device__ T operator()(const T x, const T y) const { - return x * y; - } -}; - -template -struct FuncMax { - __device__ T operator()(const T x, const T y) const { - return (x < y) ? y : x; - } -}; - -template -struct FuncMin { - __device__ T operator()(const T x, const T y) const { - return (x < y) ? x : y; - } -}; - -template<> -struct FuncSum { - union converter { - uint32_t storage; - char4 a; - }; - __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) - int32_t rv, z=0; - asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); - return rv; -#elif (__CUDA_ARCH__ >= 500) - int32_t rv; - asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; -#endif - } - __device__ char operator()(const char x, const char y) const { - return x+y; - } -}; - -template<> -struct FuncProd { - union converter { uint32_t storage; char4 a; }; - __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if (__CUDA_ARCH__ >= 300) - int32_t rv, zero=0; - asm("{ .reg .u32 t0, t1, t2, t3;\n\t" - " vmad.u32.u32.u32 t3, %1.b3, %2.b3, %3;\n\t" - " vmad.u32.u32.u32 t2, %1.b2, %2.b2, %3;\n\t" - " shl.b32 t3, t3, 16;\n\t" - " shl.b32 t2, t2, 16;\n\t" - " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t" - " shl.b32 t1, t1, 8;\n\t" - " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t" - " and.b32 t1, t1, 0xff00ff00;\n\t" - " and.b32 t0, t0, 0x00ff00ff;\n\t" - " or.b32 %0, t0, t1;\n\t" - "}" : "=r"(rv) : "r"(x), "r"(y), "r"(zero)); - return rv; -#else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x * cy.a.x; - cr.a.y = cx.a.y * cy.a.y; - cr.a.z = cx.a.z * cy.a.z; - cr.a.w = cx.a.w * cy.a.w; - return cr.storage; 
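    // Worked example for the packed fallback above (illustrative values):
    // with x = 0x02020202 and y = 0x03030303, each of the four 8-bit lanes
    // computes 2*3 = 6, so the result is 0x06060606. Lanes cannot carry into
    // each other because every per-lane product is truncated back to 8 bits.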
-#endif - } - __device__ char operator()(const char x, const char y) const { - return x*y; - } -}; - -template<> -struct FuncMax { - union converter { uint32_t storage; char4 a; }; - __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) - int32_t rv, z=0; - if (std::numeric_limits::is_signed) - asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); - else - asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); - return rv; -#elif (__CUDA_ARCH__ >= 500) - int32_t rv; - if (std::numeric_limits::is_signed) - asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - else - asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = max(cx.a.x, cy.a.x); - cr.a.y = max(cx.a.y, cy.a.y); - cr.a.z = max(cx.a.z, cy.a.z); - cr.a.w = max(cx.a.w, cy.a.w); - return cr.storage; -#endif - } - __device__ char operator()(const char x, const char y) const { - return (x>y) ? x : y; - } -}; - -template<> -struct FuncMin { - union converter { uint32_t storage; char4 a; }; - __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) - int32_t rv, z=0; - if (std::numeric_limits::is_signed) - asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); - else - asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); - return rv; -#elif (__CUDA_ARCH__ >= 500) - int32_t rv; - if (std::numeric_limits::is_signed) - asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - else - asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = min(cx.a.x, cy.a.x); - cr.a.y = min(cx.a.y, cy.a.y); - cr.a.z = min(cx.a.z, cy.a.z); - cr.a.w = min(cx.a.w, cy.a.w); - return cr.storage; -#endif - } - __device__ char operator()(const char x, const char y) const { - return (x -struct FuncSum { - __device__ half2 operator()(const half2 x, const half2 y) const { -#if __CUDA_ARCH__ >= 530 - return __hadd2(x, y); -#else - float2 fx, fy, fr; - fx = __half22float2(x); - fy = __half22float2(y); - fr.x = fx.x + fy.x; - fr.y = fx.y + fy.y; - return __float22half2_rn(fr); -#endif - } - __device__ half operator()(const half x, const half y) const { -#if __CUDA_ARCH__ >= 530 - return __hadd(x, y); -#else - return __float2half( __half2float(x) + __half2float(y) ); -#endif - } -}; - -template<> -struct FuncProd { - __device__ half2 operator()(const half2 x, const half2 y) const { -#if __CUDA_ARCH__ >= 530 - return __hmul2(x, y); -#else - float2 fx, fy, fr; - fx = __half22float2(x); - fy = __half22float2(y); - fr.x = fx.x * fy.x; - fr.y = fx.y * fy.y; - return __float22half2_rn(fr); -#endif - } - 
__device__ half operator()(const half x, const half y) const { -#if __CUDA_ARCH__ >= 530 - return __hmul(x, y); -#else - return __float2half( __half2float(x) * __half2float(y) ); -#endif - } -}; - -template<> -struct FuncMax { - __device__ half2 operator()(const half2 x, const half2 y) const { - float2 fx, fy, fr; - fx = __half22float2(x); - fy = __half22float2(y); - fr.x = fmaxf(fx.x, fy.x); - fr.y = fmaxf(fx.y, fy.y); - return __float22half2_rn(fr); - } - __device__ half operator()(const half x, const half y) const { - float fx, fy, fm; - fx = __half2float(x); - fy = __half2float(y); - fm = fmaxf(fx, fy); - return __float2half(fm); - } -}; - -template<> -struct FuncMin { - __device__ half2 operator()(const half2 x, const half2 y) const { - float2 fx, fy, fr; - fx = __half22float2(x); - fy = __half22float2(y); - fr.x = fminf(fx.x, fy.x); - fr.y = fminf(fx.y, fy.y); - return __float22half2_rn(fr); - } - __device__ half operator()(const half x, const half y) const { - float fx, fy, fm; - fx = __half2float(x); - fy = __half2float(y); - fm = fminf(fx, fy); - return __float2half(fm); - } -}; -#endif - -// Assumptions: -// - there is exactly 1 block -// - THREADS is the number of threads in the CTA -// - this function is called by all producer threads -template -__device__ void Reduce(volatile T * __restrict__ const dest, - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, const int N) { - ReduceOrCopy(threadIdx.x, dest, - nullptr, src0, src1, N); -} - -// Assumptions: -// - there is exactly 1 block -// - THREADS is the number of threads in the CTA -// - this function is called by all producer threads -template -__device__ void ReduceAndCopy(volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, const int N) { - ReduceOrCopy(threadIdx.x, dest0, dest1, - src0, src1, N); -} - -#endif // REDUCE_KERNEL_H_ diff --git a/third_party/nccl/src/reduce_scatter.cu b/third_party/nccl/src/reduce_scatter.cu deleted file mode 100644 index b1100dd0c07cc1..00000000000000 --- a/third_party/nccl/src/reduce_scatter.cu +++ /dev/null @@ -1,165 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 
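 *
 * Shape of the ring reduce-scatter implemented below (a sketch inferred
 * from ReduceScatterKernel): the send buffer is viewed as nranks blocks.
 * In step 0 each rank pushes one block to its ring neighbor; in each of
 * the next nranks-2 steps it reduces the incoming block with the matching
 * block of its own input and forwards the partial sum; in the last step it
 * reduces into recvbuff, so every rank ends up holding the fully reduced
 * block corresponding to its own position on the ring.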
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "primitives.h" - -#define NUM_SUBSTEPS 2 -#define NUM_BUFCHUNKS 2 - -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -template -__launch_bounds__(THREADS+WARP_SIZE, 1) -__global__ void ReduceScatterKernel(const KernelArgs args) { - const int tid = threadIdx.x; - __shared__ DevRing ring; - - LoadRing(args.ring, &ring); - __syncthreads(); - - if (tid == 0) { - WaitFlag prevCommOp(ring.prevOpCounter, 0); - WaitFlag nextCommOp(ring.nextOpCounter, 0); - prevCommOp.wait(args.opIndex); - nextCommOp.wait(args.opIndex); - } - __syncthreads(); - - WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS); - PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS); - PostFlag postReadyToNext(ring.sendFlagToNext, 0); - - typedef Primitives Prims; - - const int size = args.N; - const int nranks = args.nRanks; - const int buffSize = args.buffSize / sizeof(T); - const int sliceSize = buffSize / NUM_BUFCHUNKS; - - int step = 0; - int poffset, noffset = 0; - - // Compute pointers - const T * __restrict__ thisInput = args.ThisInput; - T * __restrict__ thisOutput = args.ThisOutput; - T * __restrict__ prevInput = ring.recvBuffer; - T * __restrict__ nextOutput = ring.sendBuffer; - - for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) { - /////////////// begin ReduceScatter steps /////////////// - int offset; - int maxOffset = size-chunkOffset; - int rankDest; - - // step 0: push data to next GPU - rankDest = ring.userRank[nranks-1]; - offset = chunkOffset + rankDest * size; - - Prims::Copy( - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; // Increases step, poffset, noffset - - // k-2 steps: reduce and copy to next GPU - for (int j=2; j -ncclResult_t RingReduceScatter(const void* sendbuff, void* recvbuff, - const int count, ncclComm* comm, cudaStream_t stream) { - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError); - } else { - KernelArgs args; - ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm); - LAUNCH_KERNEL(ReduceScatterKernel, THREADS, UNROLL, FUNC, T, args, stream); - } - - return ncclSuccess; -} - -template class RedOp> -class ReduceScatter { - public: - static ncclResult_t entry(const void* sendbuff, void* recvbuff, - int count, int /*root*/, ncclComm* comm, cudaStream_t stream) { - return RingReduceScatter, T>(sendbuff, recvbuff, count, comm, stream); - } -}; - -NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, int recvcount, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, int recvcount, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - NCCLCHECK(ArgsCheck(sendbuff, recvbuff, recvcount, datatype, op, 0, comm, "ReduceScatter")); - return enqueue(sendbuff, recvbuff, 
recvcount, datatype, op, 0, comm, stream); -} - diff --git a/third_party/nccl/test/include/test_utilities.h b/third_party/nccl/test/include/test_utilities.h deleted file mode 100644 index c194205b4c96f9..00000000000000 --- a/third_party/nccl/test/include/test_utilities.h +++ /dev/null @@ -1,438 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENCE.txt for license information - ************************************************************************/ - - -#ifndef SRC_TEST_UTILITIES_H_ -#define SRC_TEST_UTILITIES_H_ - -#include -#include -#include - -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("NCCL failure %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -template -void Randomize(T* const dest, const int N, const int randomSeed); - -template -void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op); - -template -double CheckDelta(const T* results, const T* expected, int N); - -#define CURAND_CHK(cmd) \ - do { \ - curandStatus_t error = (cmd); \ - if (error != CURAND_STATUS_SUCCESS) { \ - printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } while (false) - - -template -void GenerateRandom(curandGenerator_t generator, T * const dest, - const int N); - -template<> -void GenerateRandom(curandGenerator_t generator, char * const dest, - const int N) { - CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, - N * sizeof(char) / sizeof(int))); -} - -template<> -void GenerateRandom(curandGenerator_t generator, int * const dest, - const int N) { - CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, float * const dest, - const int N) { - CURAND_CHK(curandGenerateUniform(generator, dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, double * const dest, - const int N) { - CURAND_CHK(curandGenerateUniformDouble(generator, dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, unsigned long long * const dest, - const int N) { - CURAND_CHK(curandGenerateLongLong(generator, dest, N)); -} - - -template -void Randomize(T* const dest, const int N, const int randomSeed) { - curandGenerator_t gen; - CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)); - CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed)); - GenerateRandom(gen, dest, N); - CURAND_CHK(curandDestroyGenerator(gen)); - CUDACHECK(cudaDeviceSynchronize()); -} - -template<> -void Randomize(unsigned long long* const dest, const int N, const int randomSeed) { - curandGenerator_t gen; - CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64)); - GenerateRandom(gen, dest, N); - CURAND_CHK(curandDestroyGenerator(gen)); - CUDACHECK(cudaDeviceSynchronize()); -} - -template<> -void Randomize(long long* const dest, const int N, const int randomSeed) { - curandGenerator_t gen; - CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64)); - GenerateRandom(gen, (unsigned long long *)dest, N); - CURAND_CHK(curandDestroyGenerator(gen)); - CUDACHECK(cudaDeviceSynchronize()); -} - -#ifdef CUDA_HAS_HALF 
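// Note: curand exposes no half-precision generator in this CUDA generation,
// so Randomize<half> below draws uniform floats into a scratch buffer and
// then converts on the GPU with the halve kernel (__float2half per element).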
-__global__ void halve(const float * src, half* dest, int N) { - for(int tid = threadIdx.x + blockIdx.x*blockDim.x; - tid < N; tid += blockDim.x * gridDim.x) - dest[tid] = __float2half(src[tid]); -} - -template<> -void Randomize(half* const dest, const int N, const int randomSeed) { - curandGenerator_t gen; - CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)); - CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed)); - - float* temp; - CUDACHECK(cudaMalloc(&temp, N*sizeof(float))); - GenerateRandom(gen, temp, N); - halve<<<128, 512>>>(temp, dest, N); - CURAND_CHK(curandDestroyGenerator(gen)); - CUDACHECK(cudaFree(temp)); - CUDACHECK(cudaDeviceSynchronize()); -} -#endif - -void makeRandom(void* ptr, int n, ncclDataType_t type, int seed) { - if (type == ncclChar) - Randomize((char*)ptr, n, seed); - else if (type == ncclInt) - Randomize((int*)ptr, n, seed); -#ifdef CUDA_HAS_HALF - else if (type == ncclHalf) - Randomize((half*)ptr, n, seed); -#endif - else if (type == ncclFloat) - Randomize((float*)ptr, n, seed); - else if (type == ncclDouble) - Randomize((double*)ptr, n, seed); - else if (type == ncclInt64) - Randomize((long long*)ptr, n, seed); - else if (type == ncclUint64) - Randomize((unsigned long long*)ptr, n, seed); - - return; -} - -template __global__ static -void accumKern(T* acum, const T* contrib, int N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i c) ? a : c; - } else if(OP == ncclMin) { - acum[i] = (a < c) ? a : c; - } - } -} - -#ifdef CUDA_HAS_HALF -template<> __global__ -void accumKern(half* acum, const half* contrib, int N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i __global__ -void accumKern(half* acum, const half* contrib, int N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i __global__ -void accumKern(half* acum, const half* contrib, int N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; ic) ? 
a : c ); - } -} - -template<> __global__ -void accumKern(half* acum, const half* contrib, int N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i -void accVecType(void* out, void* in, int n, ncclRedOp_t op) { - switch(op) { - case ncclSum: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; - case ncclProd: accumKern<<<256,256>>>((T*)out, (T*)in, n); break; - case ncclMax: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; - case ncclMin: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; - default: - printf("Unknown reduction operation.\n"); - exit(EXIT_FAILURE); - } -} - -template -void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op) { - - T* devdest; - CUDACHECK(cudaHostRegister(dest, N*sizeof(T), 0)); - CUDACHECK(cudaHostGetDevicePointer(&devdest, dest, 0)); - accVecType((void*)devdest, (void*)contrib, N, op); - CUDACHECK(cudaHostUnregister(dest)); -} - -void accVec(void* out, void* in, int n, ncclDataType_t type, ncclRedOp_t op) { - switch (type) { - case ncclChar: accVecType (out, in, n, op); break; - case ncclInt: accVecType (out, in, n, op); break; -#ifdef CUDA_HAS_HALF - case ncclHalf: accVecType (out, in, n, op); break; -#endif - case ncclFloat: accVecType (out, in, n, op); break; - case ncclDouble: accVecType (out, in, n, op); break; - case ncclInt64: accVecType (out, in, n, op); break; - case ncclUint64: accVecType (out, in, n, op); break; - default: - printf("Unknown reduction type.\n"); - exit(EXIT_FAILURE); - } -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -#ifdef CUDA_HAS_HALF -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} -#endif - -template __global__ -void deltaKern(const T* A, const T* B, int N, double* max) { - __shared__ double temp[BSIZE]; - int tid = threadIdx.x; - double locmax = 0.0; - for(int i=tid; i locmax ) - locmax = delta; - } - - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - *max = temp[0] > temp[1] ? 
temp[0] : temp[1]; -} - -template -double CheckDelta(const T* results, const T* expected, int N) { - T* devexp; - double maxerr; - double* devmax; - CUDACHECK(cudaHostRegister((void*)expected, N*sizeof(T), 0)); - CUDACHECK(cudaHostGetDevicePointer((void**)&devexp, (void*)expected, 0)); - CUDACHECK(cudaHostRegister((void*)&maxerr, sizeof(double), 0)); - CUDACHECK(cudaHostGetDevicePointer(&devmax, &maxerr, 0)); - deltaKern<<<1, 512>>>(results, devexp, N, devmax); - CUDACHECK(cudaHostUnregister(&maxerr)); - CUDACHECK(cudaHostUnregister((void*)devexp)); - return maxerr; -} - -void maxDiff(double* max, void* first, void* second, int n, ncclDataType_t type, cudaStream_t s) { - switch (type) { - case ncclChar: deltaKern <<<1,512,0,s>>>((char*)first, (char*)second, n, max); break; - case ncclInt: deltaKern <<<1,512,0,s>>>((int*)first, (int*)second, n, max); break; -#ifdef CUDA_HAS_HALF - case ncclHalf: deltaKern <<<1,512,0,s>>>((half*)first, (half*)second, n, max); break; -#endif - case ncclFloat: deltaKern <<<1,512,0,s>>>((float*)first, (float*)second, n, max); break; - case ncclDouble: deltaKern <<<1,512,0,s>>>((double*)first, (double*)second, n, max); break; - case ncclInt64: deltaKern <<<1,512,0,s>>>((long long*)first, (long long*)second, n, max); break; - case ncclUint64: deltaKern<<<1,512,0,s>>>((unsigned long long*)first, (unsigned long long*)second, n, max); break; - default: - printf("Unknown reduction type.\n"); - exit(EXIT_FAILURE); - } -} - -std::string TypeName(const ncclDataType_t type) { - switch (type) { - case ncclChar: return "char"; - case ncclInt: return "int"; -#ifdef CUDA_HAS_HALF - case ncclHalf: return "half"; -#endif - case ncclFloat: return "float"; - case ncclDouble: return "double"; - case ncclInt64: return "int64"; - case ncclUint64: return "uint64"; - default: return "unknown"; - } -} - -std::string OperationName(const ncclRedOp_t op) { - switch (op) { - case ncclSum: return "sum"; - case ncclProd: return "prod"; - case ncclMax: return "max"; - case ncclMin: return "min"; - default: return "unknown"; - } -} - -ncclDataType_t strToType(const char* s) { - if (strcmp(s, "char") == 0) - return ncclChar; - if (strcmp(s, "int") == 0) - return ncclInt; -#ifdef CUDA_HAS_HALF - if (strcmp(s, "half") == 0) - return ncclHalf; -#endif - if (strcmp(s, "float") == 0) - return ncclFloat; - if (strcmp(s, "double") == 0) - return ncclDouble; - if (strcmp(s, "int64") == 0) - return ncclInt64; - if (strcmp(s, "uint64") == 0) - return ncclUint64; - - return nccl_NUM_TYPES; -} - -size_t wordSize(ncclDataType_t type) { - switch(type) { - case ncclChar: return sizeof(char); - case ncclInt: return sizeof(int); -#ifdef CUDA_HAS_HALF - case ncclHalf: return sizeof(short); -#endif - case ncclFloat: return sizeof(float); - case ncclDouble: return sizeof(double); - case ncclInt64: return sizeof(long long); - case ncclUint64: return sizeof(unsigned long long); - } - - return 0; -} - -double deltaMaxValue(ncclDataType_t type, bool is_reduction) { - if (is_reduction) { - switch(type) { -#ifdef CUDA_HAS_HALF - case ncclHalf: return 5e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - } - } - return 1e-200; -} - -ncclRedOp_t strToOp(const char* s) { - if (strcmp(s, "sum") == 0) - return ncclSum; - if (strcmp(s, "prod") == 0) - return ncclProd; - if (strcmp(s, "max") == 0) - return ncclMax; - if (strcmp(s, "min") == 0) - return ncclMin; - - return nccl_NUM_OPS; -} - -int strToPosInt(const char* s) { - errno = 0; - long temp = strtol(s, NULL, 10); - if (errno != 0 || temp > 
INT_MAX || temp < 0) - return 0; - return (int)temp; -} - -int strToNonNeg(const char* s) { - errno = 0; - long temp = strtol(s, NULL, 10); - if (errno != 0 || temp > INT_MAX || temp < 0) - return -1; - return (int)temp; -} - -#endif // SRC_TEST_UTILITIES_H_ diff --git a/third_party/nccl/test/mpi/mpi_test.cu b/third_party/nccl/test/mpi/mpi_test.cu deleted file mode 100644 index fea6ae599c80b2..00000000000000 --- a/third_party/nccl/test/mpi/mpi_test.cu +++ /dev/null @@ -1,93 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENCE.txt for license information - ************************************************************************/ - -#include -#include -#include - -#include "nccl.h" -#include "mpi.h" -#include "test_utilities.h" - -#define SIZE 128 -#define NITERS 1 - -int main(int argc, char *argv[]) { - ncclUniqueId commId; - int size, rank; - ncclResult_t ret; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &size); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc < size) { - if (rank == 0) - printf("Usage : %s \n", argv[0]); - exit(1); - } - - int gpu = atoi(argv[rank+1]); - - // We have to set our device before NCCL init - CUDACHECK(cudaSetDevice(gpu)); - MPI_Barrier(MPI_COMM_WORLD); - - // NCCL Communicator creation - ncclComm_t comm; - NCCLCHECK(ncclGetUniqueId(&commId)); - MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD); - ret = ncclCommInitRank(&comm, size, commId, rank); - if (ret != ncclSuccess) { - printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret)); - exit(1); - } - - // CUDA stream creation - cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - // Initialize input values - int *dptr; - CUDACHECK(cudaMalloc(&dptr, SIZE*2*sizeof(int))); - int *val = (int*) malloc(SIZE*sizeof(int)); - for (int v=0; v -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -void showUsage(const char* bin) { - printf("\n" - "Usage: %s [delta] [gpus] [gpu0 [gpu1 [...]]]\n" - "Where:\n" -#ifdef CUDA_HAS_HALF - " type = [char|int|half|float|double|int64|uint64]\n" -#else - " type = [char|int|float|double|int64|uint64]\n" -#endif - " n_min > 0\n" - " n_max >= n_min\n" - " delta > 0\n\n", bin); - return; -} - -int main(int argc, char* argv[]) { - int nvis = 0; - CUDACHECK(cudaGetDeviceCount(&nvis)); - if (nvis == 0) { - printf("No GPUs found\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - ncclDataType_t type; - int n_min; - int n_max; - int delta; - int gpus; - int* list = NULL; - - if (argc < 4) { - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - type = strToType(argv[1]); - if (type == nccl_NUM_TYPES) { - printf("Invalid '%s'\n", argv[1]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_min = strToPosInt(argv[2]); - if (n_min < 1) { - printf("Invalid '%s'\n", argv[2]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_max = strToPosInt(argv[3]); - if (n_max < n_min) { - printf("Invalid '%s'\n", argv[3]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - if (argc > 4) { - delta = strToPosInt(argv[4]); - if (delta < 1) { - printf("Invalid '%s'\n", argv[4]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - delta = (n_max == n_min) ? 
1 : (n_max - n_min+9) / 10; - } - - if (argc > 5) { - gpus = strToPosInt(argv[5]); - if (gpus < 1) { - printf("Invalid '%s'\n", argv[5]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - gpus = nvis; - } - - list = (int*)malloc(gpus*sizeof(int)); - - if (argc > 6 && argc != 6+gpus) { - printf("If given, GPU list must be fully specified.\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - for(int g=0; g 6) { - list[g] = strToNonNeg(argv[6+g]); - if (list[g] < 0) { - printf("Invalid GPU%d '%s'\n", g, argv[6+g]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } else if (list[g] >= nvis) { - printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - list[g] = g % nvis; - } - } - - size_t word = wordSize(type); - size_t max_input = n_max * word; - size_t max_output = max_input * gpus; - void* refout; - CUDACHECK(cudaMallocHost(&refout, max_output)); - - void **input, **output; - double** localError; - ncclComm_t* comm; - cudaStream_t* stream; - - input = (void**)malloc(gpus*sizeof(void*)); - output = (void**)malloc(gpus*sizeof(void*)); - localError = (double**)malloc(gpus*sizeof(double*)); - comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); - stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); - - for(int g=0; g> - (stop - start).count() * 1000.0; - - double max_error = 0.0; - for(int slice=0; slice -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" - -int errors = 0; -double avg_bw = 0.0; -int avg_count = 0; -bool is_reduction = false; - -template -void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, - ncclComm_t* const comms, const std::vector& dList) { - // initialize data - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); - T* buffer = (T*)malloc(nDev * N * sizeof(T)); - T* result = (T*)malloc(nDev * N * sizeof(T)); - memset(buffer, 0, nDev * N * sizeof(T)); - memset(result, 0, nDev * N * sizeof(T)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamCreate(s+i)); - CUDACHECK(cudaMemset(recvbuff[i], 0, nDev * N * sizeof(T))); - Randomize(sendbuff[i], N, i); - - CUDACHECK(cudaMemcpy(result + i * N, sendbuff[i], N * sizeof(T), - cudaMemcpyDeviceToHost)); - } - - // warm up GPU - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type, - (void*)recvbuff[i], comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - //for (int n = 1; n <= N; n = n << 1) - { - int n = N; - printf("%12i %12i %6s", (int)(n * sizeof(T)), n, TypeName(type).c_str()); - - auto start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i], - s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); - double algbw = (double)(n * sizeof(T)) / 1.0E9 * (double)(nDev - 1) - / elapsedSec; - double busbw = algbw; - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - 
CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta(recvbuff[i], result, nDev*N); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw, - maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamDestroy(s[i])); - } - free(s); - free(buffer); - free(result); -} - -template -void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, - const std::vector& dList) { - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - T** sendbuff = (T**)malloc(nDev * sizeof(T*)); - T** recvbuff = (T**)malloc(nDev * sizeof(T*)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T))); - CUDACHECK(cudaMalloc(recvbuff + i, nDev * N * sizeof(T))); - } - - RunTest(sendbuff, recvbuff, N, type, comms, dList); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaFree(sendbuff[i])); - CUDACHECK(cudaFree(recvbuff[i])); - } - - free(sendbuff); - free(recvbuff); -} - -void usage() { - printf("Tests nccl AllGather with user supplied arguments.\n" - " Usage: all_reduce_test [number of GPUs] " - "[GPU 0] [GPU 1] ...\n\n"); -} - -int main(int argc, char* argv[]) { - int nVis = 0; - CUDACHECK(cudaGetDeviceCount(&nVis)); - - int N = 0; - if (argc > 1) { - int t = sscanf(argv[1], "%d", &N); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } else { - printf("Error: must specify at least data size in bytes!\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - int nDev = nVis; - if (argc > 2) { - int t = sscanf(argv[2], "%d", &nDev); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } - std::vector dList(nDev); - for (int i = 0; i < nDev; ++i) - dList[i] = i % nVis; - - - if (argc > 3) { - if (argc - 3 != nDev) { - printf("Error: insufficient number of GPUs in list\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - for (int i = 0; i < nDev; ++i) { - int t = sscanf(argv[3 + i], "%d", dList.data() + i); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[2 + i]); - usage(); - exit(EXIT_FAILURE); - } - } - } - - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); - - printf("# Using devices\n"); - for (int g=0; g(N / sizeof(char), ncclChar, comms, dList); - RunTests(N / sizeof(int), ncclInt, comms, dList); -#ifdef CUDA_HAS_HALF - RunTests(N / sizeof(half), ncclHalf, comms, dList); -#endif - RunTests(N / sizeof(float), ncclFloat, comms, dList); - RunTests(N / sizeof(double), ncclDouble, comms, dList); - RunTests(N / sizeof(long long), ncclInt64, comms, dList); - RunTests(N / sizeof(unsigned long long), ncclUint64, comms, dList); - - printf("\n"); - - for(int i=0; i -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -void showUsage(const char* bin) { - printf("\n" - "Usage: %s [delta] [gpus] [gpu0 [gpu1 [...]]]\n" - "Where:\n" -#ifdef CUDA_HAS_HALF - " type = [char|int|half|float|double|int64|uint64]\n" -#else - " type = [char|int|float|double|int64|uint64]\n" -#endif - " op = [sum|prod|max|min]\n" - " n_min > 0\n" - " n_max >= n_min\n" - " delta > 0\n\n", bin); - return; -} - -int main(int argc, char* argv[]) { - int 
nvis = 0; - CUDACHECK(cudaGetDeviceCount(&nvis)); - if (nvis == 0) { - printf("No GPUs found\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - ncclDataType_t type; - ncclRedOp_t op; - int n_min; - int n_max; - int delta; - int gpus; - int* list = NULL; - - if (argc < 5) { - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - type = strToType(argv[1]); - if (type == nccl_NUM_TYPES) { - printf("Invalid '%s'\n", argv[1]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - op = strToOp(argv[2]); - if (op == nccl_NUM_OPS) { - printf("Invalid '%s'\n", argv[2]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_min = strToPosInt(argv[3]); - if (n_min < 1) { - printf("Invalid '%s'\n", argv[3]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_max = strToPosInt(argv[4]); - if (n_max < n_min) { - printf("Invalid '%s'\n", argv[4]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - if (argc > 5) { - delta = strToPosInt(argv[5]); - if (delta < 1) { - printf("Invalid '%s'\n", argv[5]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; - } - - if (argc > 6) { - gpus = strToPosInt(argv[6]); - if (gpus < 1) { - printf("Invalid '%s'\n", argv[6]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - gpus = nvis; - } - - list = (int*)malloc(gpus*sizeof(int)); - - if (argc > 7 && argc != 7+gpus) { - printf("If given, GPU list must be fully specified.\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - for(int g=0; g 7) { - list[g] = strToNonNeg(argv[7+g]); - if (list[g] < 0) { - printf("Invalid GPU%d '%s'\n", g, argv[7+g]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } else if (list[g] >= nvis) { - printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - list[g] = g % nvis; - } - } - - size_t word = wordSize(type); - size_t max_size = n_max * word; - void* refout; - CUDACHECK(cudaMallocHost(&refout, max_size)); - - void **input, **output; - double** localError; - ncclComm_t* comm; - cudaStream_t* stream; - - input = (void**)malloc(gpus*sizeof(void*)); - output = (void**)malloc(gpus*sizeof(void*)); - localError = (double**)malloc(gpus*sizeof(double*)); - comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); - stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); - - for(int g=0; g> - (stop - start).count() * 1000.0; - - double max_error = 0.0; - for(int g=0; g -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -int csv = false; -int errors = 0; -double avg_bw = 0.0; -int avg_count = 0; -bool is_reduction = true; - -template -void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, - const ncclRedOp_t op, ncclComm_t* comms, const std::vector& dList) { - // initialize data - T* buffer = (T*)malloc(N * sizeof(T)); - T* result = (T*)malloc(N * sizeof(T)); - memset(buffer, 0, N * sizeof(T)); - memset(result, 0, N * sizeof(T)); - - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamCreate(s+i)); - CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T))); - Randomize(sendbuff[i], N, i); - if(i == 0) { - CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost)); - } else { - Accumulate(result, sendbuff[i], N, op); - } - } - - // warm up GPU - for (int i = 0; i < nDev; ++i) { - 
CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - -// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1) - { - int n = N; - printf((csv) ? "%i,%i,%s,%s," : "%12i %12i %6s %6s", - (int) (n * sizeof(T)), n, TypeName(type).c_str(), - OperationName(op).c_str()); - - // do out-of-place reduction first - nvtxRangePushA("out of place"); - auto start = std::chrono::high_resolution_clock::now(); - //for (int i=0; i<100; i++) { - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op, - comms[i], s[i])); - } - //} - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - nvtxRangePop(); - - nvtxRangePushA("out of place bookkeeping"); - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); // / 100.0; - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev; - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta(recvbuff[i], result, N); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le", - elapsedSec * 1.0E3, algbw, busbw, maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - nvtxRangePop(); - } - - -// for (int n = 0; n <= N; n = (n > 0) ? 
n << 1 : 1) - { - int n = N; - // now do in-place reduction - nvtxRangePushA("in place"); - auto start = std::chrono::high_resolution_clock::now(); - //for (int i=0; i<100; i++) { - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op, - comms[i], s[i])); - } - //} - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - nvtxRangePop(); - - nvtxRangePushA("in place bookkeeping"); - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); // / 100.0; - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev; - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta(sendbuff[i], result, N); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n", - elapsedSec * 1.0E3, algbw, busbw, maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - nvtxRangePop(); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamDestroy(s[i])); - } - free(s); - free(buffer); - free(result); -} - -template -void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms, - const std::vector& dList) { - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - T** sendbuff = (T**)malloc(nDev * sizeof(T*)); - T** recvbuff = (T**)malloc(nDev * sizeof(T*)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T))); - CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T))); - } - - for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) { -// for (ncclRedOp_t op : { ncclSum }) { - RunTest(sendbuff, recvbuff, N, type, op, comms, dList); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaFree(sendbuff[i])); - CUDACHECK(cudaFree(recvbuff[i])); - } - - free(sendbuff); - free(recvbuff); -} - -void usage() { - printf("Tests nccl AllReduce with user supplied arguments.\n" - " Usage: all_reduce_test [number of GPUs] " - "[GPU 0] [GPU 1] ...\n\n"); -} - -int main(int argc, char* argv[]) { - int nVis = 0; - CUDACHECK(cudaGetDeviceCount(&nVis)); - - int N = 0; - if (argc > 1) { - int t = sscanf(argv[1], "%d", &N); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } else { - printf("Error: must specify at least data size in bytes!\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - int nDev = nVis; - if (argc > 2) { - int t = sscanf(argv[2], "%d", &nDev); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } - std::vector dList(nDev); - for (int i = 0; i < nDev; ++i) - dList[i] = i % nVis; - - if (argc > 3) { - if (argc - 3 != nDev) { - printf("Error: insufficient number of GPUs in list\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - for (int i = 0; i < nDev; ++i) { - int t = sscanf(argv[3 + i], "%d", dList.data() + i); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[2 + i]); - usage(); - exit(EXIT_FAILURE); - } - } - } - - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); 
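  // A note on the numbers RunTest prints: busbw = algbw * (2*nDev-2)/nDev is
  // the usual ring-allreduce correction -- each element crosses the inter-GPU
  // links 2*(nDev-1) times, spread across nDev ranks. For example, with
  // nDev = 4 and a measured algbw of 10 GB/s, busbw = 10 * 6/4 = 15 GB/s.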
- - if (!csv) { - printf("# Using devices\n"); - for (int g = 0; g < nDev; ++g) { - int cudaDev; - int rank; - cudaDeviceProp prop; - NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); - NCCLCHECK(ncclCommUserRank(comms[g], &rank)); - CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); - printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, - prop.pciBusID, prop.name); - } - printf("\n"); - - printf("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", ""); - printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op", - "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); - } - else { - printf("B,N,type,op,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n"); - } - - RunTests(N / sizeof(char), ncclChar, comms, dList); - RunTests(N / sizeof(int), ncclInt, comms, dList); -#ifdef CUDA_HAS_HALF - RunTests(N / sizeof(half), ncclHalf, comms, dList); -#endif - RunTests(N / sizeof(float), ncclFloat, comms, dList); - RunTests(N / sizeof(double), ncclDouble, comms, dList); - RunTests(N / sizeof(long long), ncclInt64, comms, dList); - RunTests(N / sizeof(unsigned long long), ncclUint64, comms, dList); - - printf("\n"); - - for(int i=0; i -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -void showUsage(const char* bin) { - printf("\n" - "Usage: %s [delta] [gpus] [gpu0 [gpu1 [...]]]\n" - "Where:\n" -#ifdef CUDA_HAS_HALF - " type = [char|int|half|float|double|int64|uint64]\n" -#else - " type = [char|int|float|double|int64|uint64]\n" -#endif - " n_min > 0\n" - " n_max >= n_min\n" - " delta > 0\n\n", bin); - return; -} - -int main(int argc, char* argv[]) { - int nvis = 0; - CUDACHECK(cudaGetDeviceCount(&nvis)); - if (nvis == 0) { - printf("No GPUs found\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - ncclDataType_t type; - int n_min; - int n_max; - int delta; - int gpus; - int* list = NULL; - - if (argc < 4) { - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - type = strToType(argv[1]); - if (type == nccl_NUM_TYPES) { - printf("Invalid '%s'\n", argv[1]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_min = strToPosInt(argv[2]); - if (n_min < 1) { - printf("Invalid '%s'\n", argv[2]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_max = strToPosInt(argv[3]); - if (n_max < n_min) { - printf("Invalid '%s'\n", argv[3]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - if (argc > 4) { - delta = strToPosInt(argv[4]); - if (delta < 1) { - printf("Invalid '%s'\n", argv[4]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - delta = (n_max == n_min) ? 
1 : (n_max - n_min+9) / 10; - } - - if (argc > 5) { - gpus = strToPosInt(argv[5]); - if (gpus < 1) { - printf("Invalid '%s'\n", argv[5]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - gpus = nvis; - } - - list = (int*)malloc(gpus*sizeof(int)); - - if (argc > 6 && argc != 6+gpus) { - printf("If given, GPU list must be fully specified.\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - for(int g=0; g 6) { - list[g] = strToNonNeg(argv[6+g]); - if (list[g] < 0) { - printf("Invalid GPU%d '%s'\n", g, argv[6+g]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } else if (list[g] >= nvis) { - printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - list[g] = g % nvis; - } - } - - size_t word = wordSize(type); - size_t max_size = n_max * word; - void* refout; - CUDACHECK(cudaMallocHost(&refout, max_size)); - - void** io; - double* localError; - ncclComm_t* comm; - cudaStream_t* stream; - - io = (void**)malloc(gpus*sizeof(void*)); - CUDACHECK(cudaMallocHost(&localError, gpus*sizeof(double))); - comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); - stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); - - for(int g=0; g> - (stop - start).count() * 1000.0; - - for(int g=1; g -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" - -int errors = 0; -double avg_bw = 0.0; -int avg_count = 0; -bool is_reduction = false; - -template -void RunTest(T** buff, const int N, const ncclDataType_t type, const int root, - ncclComm_t* const comms, const std::vector& dList) { - // initialize data - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); - T* buffer = (T*)malloc(N * sizeof(T)); - T* result = (T*)malloc(N * sizeof(T)); - memset(result, 0, N * sizeof(T)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamCreate(s+i)); - - if (i == root) { - Randomize(buff[root], N, root); - CUDACHECK(cudaMemcpy(result, buff[root], N * sizeof(T), - cudaMemcpyDeviceToHost)); - } else { - CUDACHECK(cudaMemset(buff[i], 0, N * sizeof(T))); - } - - CUDACHECK(cudaDeviceSynchronize()); - } - - // warm up GPU - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - -// for (int n = 1; n <= N; n = n << 1) - { - int n = N; - printf("%12i %12i %6s %4i", (int)(n * sizeof(T)), n, - TypeName(type).c_str(), root); - - auto start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw; - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta(buff[i], result, n); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw, - maxDelta); - - if 
(maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - } - - for(int i=0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamDestroy(s[i])); - } - free(s); - free(buffer); - free(result); -} - -template -void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, - const std::vector& dList) { - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - T** buff = (T**)malloc(nDev * sizeof(T*)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaMalloc(buff + i, N * sizeof(T))); - } - - //for (int root = 1; root < 2; ++root) { - for (int root = 0; root < nDev; ++root) { - RunTest(buff, N, type, root, comms, dList); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaFree(buff[i])); - } - - free(buff); -} - -void usage() { - printf("Tests nccl Broadcast with user supplied arguments.\n" - " Usage: broadcast_test [number of GPUs] " - "[GPU 0] [GPU 1] ...\n\n"); -} - -int main(int argc, char* argv[]) { - int nVis = 0; - CUDACHECK(cudaGetDeviceCount(&nVis)); - - unsigned long long N = 0; - if (argc > 1) { - int t = sscanf(argv[1], "%llu", &N); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } else { - printf("Error: must specify at least data size in bytes!\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - int nDev = nVis; - if (argc > 2) { - int t = sscanf(argv[2], "%d", &nDev); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } - std::vector dList(nDev); - for (int i = 0; i < nDev; ++i) - dList[i] = i % nVis; - - if (argc > 3) { - if (argc - 3 != nDev) { - printf("Error: insufficient number of GPUs in list\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - for (int i = 0; i < nDev; ++i) { - int t = sscanf(argv[3 + i], "%d", dList.data() + i); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[2 + i]); - usage(); - exit(EXIT_FAILURE); - } - } - } - - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);; - NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); - - printf("# Using devices\n"); - for (int g = 0; g < nDev; ++g) { - int cudaDev; - int rank; - cudaDeviceProp prop; - NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); - NCCLCHECK(ncclCommUserRank(comms[g], &rank)); - CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); - printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, - prop.pciBusID, prop.name); - } - printf("\n"); - - printf("# %10s %12s %6s %4s %7s %5s %5s %7s\n", - "bytes", "N", "type", "root", "time", "algbw", "busbw", "delta"); - - RunTests(N / sizeof(char), ncclChar, comms, dList); - RunTests(N / sizeof(int), ncclInt, comms, dList); -#ifdef CUDA_HAS_HALF - RunTests(N / sizeof(half), ncclHalf, comms, dList); -#endif - RunTests(N / sizeof(float), ncclFloat, comms, dList); - RunTests(N / sizeof(double), ncclDouble, comms, dList); - RunTests(N / sizeof(long long), ncclInt64, comms, dList); - RunTests(N / sizeof(unsigned long long), ncclUint64, comms, dList); - - printf("\n"); - - for(int i = 0; i < nDev; ++i) - ncclCommDestroy(comms[i]); - free(comms); - - char* str = getenv("NCCL_TESTS_MIN_BW"); - double check_avg_bw = str ? atof(str) : -1; - avg_bw /= avg_count; - - printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK"); - printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? 
"FAILED" : "OK")); - printf("\n"); - if (errors || avg_bw < check_avg_bw) - exit(EXIT_FAILURE); - else - exit(EXIT_SUCCESS); -} - diff --git a/third_party/nccl/test/single/reduce_scan.cu b/third_party/nccl/test/single/reduce_scan.cu deleted file mode 100644 index f42643eb4db3c4..00000000000000 --- a/third_party/nccl/test/single/reduce_scan.cu +++ /dev/null @@ -1,238 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- ************************************************************************/ - -#include -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -void showUsage(const char* bin) { - printf("\n" - "Usage: %s [delta] [gpus] [gpu0 [gpu1 [...]]]\n" - "Where:\n" -#ifdef CUDA_HAS_HALF - " type = [char|int|half|float|double|int64|uint64]\n" -#else - " type = [char|int|float|double|int64|uint64]\n" -#endif - " op = [sum|prod|max|min]\n" - " n_min > 0\n" - " n_max >= n_min\n" - " delta > 0\n\n", bin); - return; -} - -int main(int argc, char* argv[]) { - int nvis = 0; - CUDACHECK(cudaGetDeviceCount(&nvis)); - if (nvis == 0) { - printf("No GPUs found\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - ncclDataType_t type; - ncclRedOp_t op; - int n_min; - int n_max; - int delta; - int gpus; - int* list = NULL; - - if (argc < 5) { - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - type = strToType(argv[1]); - if (type == nccl_NUM_TYPES) { - printf("Invalid '%s'\n", argv[1]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - op = strToOp(argv[2]); - if (op == nccl_NUM_OPS) { - printf("Invalid '%s'\n", argv[2]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_min = strToPosInt(argv[3]); - if (n_min < 1) { - printf("Invalid '%s'\n", argv[3]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_max = strToPosInt(argv[4]); - if (n_max < n_min) { - printf("Invalid '%s'\n", argv[4]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - if (argc > 5) { - delta = strToPosInt(argv[5]); - if (delta < 1) { - printf("Invalid '%s'\n", argv[5]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; - } - - if (argc > 6) { - gpus = strToPosInt(argv[6]); - if (gpus < 1) { - printf("Invalid '%s'\n", argv[6]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - gpus = nvis; - } - - list = (int*)malloc(gpus*sizeof(int)); - - if (argc > 7 && argc != 7+gpus) { - printf("If given, GPU list must be fully specified.\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - for(int g=0; g 7) { - list[g] = strToNonNeg(argv[7+g]); - if (list[g] < 0) { - printf("Invalid GPU%d '%s'\n", g, argv[7+g]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } else if (list[g] >= nvis) { - printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - list[g] = g % nvis; - } - } - - size_t word = wordSize(type); - size_t max_size = n_max * word; - void* refout; - CUDACHECK(cudaMallocHost(&refout, max_size)); - - void** input; - void* output; // always goes on rank 0 - double* maxError; - ncclComm_t* comm; - cudaStream_t* stream; - - input = (void**)malloc(gpus*sizeof(void*)); - comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); - stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); - - for(int g=0; g> - (stop - start).count() * 1000.0; - - CUDACHECK(cudaSetDevice(list[0])); - maxDiff(maxError, output, refout, n, type, stream[0]); - CUDACHECK(cudaStreamSynchronize(stream[0])); - - double mb = (double)bytes * 1.e-6; - double algbw = mb / ms; - printf("%12lu %5.0le %10.3lf %6.2lf\n", - n*word, *maxError, ms, algbw); - } - - for(int g=0; g -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -void showUsage(const char* bin) { - printf("\n" - "Usage: %s [delta] [gpus] [gpu0 [gpu1 [...]]]\n" - "Where:\n" -#ifdef CUDA_HAS_HALF - " type = [char|int|half|float|double|int64|uint64]\n" -#else - " type = 
[char|int|float|double|int64|uint64]\n" -#endif - " op = [sum|prod|max|min]\n" - " n_min > 0\n" - " n_max >= n_min\n" - " delta > 0\n\n", bin); - return; -} - -int main(int argc, char* argv[]) { - int nvis = 0; - CUDACHECK(cudaGetDeviceCount(&nvis)); - if (nvis == 0) { - printf("No GPUs found\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - ncclDataType_t type; - ncclRedOp_t op; - int n_min; - int n_max; - int delta; - int gpus; - int* list = NULL; - - if (argc < 5) { - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - type = strToType(argv[1]); - if (type == nccl_NUM_TYPES) { - printf("Invalid '%s'\n", argv[1]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - op = strToOp(argv[2]); - if (op == nccl_NUM_OPS) { - printf("Invalid '%s'\n", argv[2]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_min = strToPosInt(argv[3]); - if (n_min < 1) { - printf("Invalid '%s'\n", argv[3]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - n_max = strToPosInt(argv[4]); - if (n_max < n_min) { - printf("Invalid '%s'\n", argv[4]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - if (argc > 5) { - delta = strToPosInt(argv[5]); - if (delta < 1) { - printf("Invalid '%s'\n", argv[5]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; - } - - if (argc > 6) { - gpus = strToPosInt(argv[6]); - if (gpus < 1) { - printf("Invalid '%s'\n", argv[6]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - gpus = nvis; - } - - list = (int*)malloc(gpus*sizeof(int)); - - if (argc > 7 && argc != 7+gpus) { - printf("If given, GPU list must be fully specified.\n"); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - - for(int g=0; g 7) { - list[g] = strToNonNeg(argv[7+g]); - if (list[g] < 0) { - printf("Invalid GPU%d '%s'\n", g, argv[7+g]); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } else if (list[g] >= nvis) { - printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); - showUsage(argv[0]); - exit(EXIT_FAILURE); - } - } else { - list[g] = g % nvis; - } - } - - size_t word = wordSize(type); - size_t max_output = n_max * word; - size_t max_input = gpus * max_output; - void* refout; - CUDACHECK(cudaMallocHost(&refout, max_input)); // contains entire reduction - - void **input, **output; - double** localError; - ncclComm_t* comm; - cudaStream_t* stream; - - input = (void**)malloc(gpus*sizeof(void*)); - output = (void**)malloc(gpus*sizeof(void*)); - localError = (double**)malloc(gpus*sizeof(double*)); - comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); - stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); - - for(int g=0; g> - (stop - start).count() * 1000.0; - - double max_error = 0.0; - for(int g=0; g -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" - -int errors = 0; -double avg_bw = 0.0; -int avg_count = 0; -bool is_reduction = true; - -template -void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, - const ncclRedOp_t op, ncclComm_t* const comms, const std::vector& dList) { - // initialize data - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); - - T* buffer = (T*)malloc(N * nDev * sizeof(T)); - T* result = (T*)malloc(N * nDev * sizeof(T)); - memset(buffer, 0, N * nDev * sizeof(T)); - memset(result, 0, N * nDev * sizeof(T)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamCreate(s+i)); - 
CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T))); - Randomize(sendbuff[i], N * nDev, i); - - if (i == 0) { - CUDACHECK(cudaMemcpy(result, sendbuff[i], N * nDev * sizeof(T), - cudaMemcpyDeviceToHost)); - } else { - Accumulate(result, sendbuff[i], N * nDev, op); - } - } - - // warm up GPU - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], - std::min(N, 1024 * 1024), type, op, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - -// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1) - { - int n = N; - printf("%12i %12i %6s %6s", (int)(n * sizeof(T)), n, - TypeName(type).c_str(), OperationName(op).c_str()); - - // do out-of-place reduction first - auto start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type, - op, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw * (double)(nDev - 1); - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta(recvbuff[i], result+i*n, n); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf(" %7.3f %5.2f %5.2f %7.0le", elapsedSec * 1.0E3, algbw, busbw, - maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - } - - { - // now do in-place reduction - int n = N; - - auto start = std::chrono::high_resolution_clock::now(); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type, - op, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw * (double)(nDev - 1); - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta(sendbuff[i], result+i*n, n); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw, - maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamDestroy(s[i])); - } - free(s); - free(buffer); - free(result); -} - -template -void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, - const std::vector& dList) { - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - T** sendbuff = (T**)malloc(nDev * sizeof(T*)); - T** recvbuff = (T**)malloc(nDev * sizeof(T*)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaMalloc(sendbuff + i, N * nDev * sizeof(T))); - CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T))); - } 
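For readers comparing the numbers these deleted harnesses print: algorithmic bandwidth is bytes processed per second, and for reduce-scatter the bus bandwidth multiplies it by (nDev - 1), the number of peers each rank exchanges data with, exactly as computed in RunTest above. A minimal standalone sketch of that accounting (the helper name and example values are illustrative, not part of the harness):

#include <cstdio>

// Illustrative sketch of the bandwidth accounting in the deleted tests:
// algbw = bytes / seconds (reported in GB/s); for reduce-scatter,
// busbw = algbw * (nDev - 1).
static void reportBandwidth(double bytes, double elapsedSec, int nDev) {
  double algbw = bytes / 1.0E9 / elapsedSec;
  double busbw = algbw * (double)(nDev - 1);
  printf("algbw %5.2f GB/s  busbw %5.2f GB/s\n", algbw, busbw);
}

int main() {
  reportBandwidth(64.0 * 1024 * 1024, 0.010, 4);  // 64 MiB in 10 ms on 4 GPUs
}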
- - for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) { -// for (ncclRedOp_t op : { ncclSum }) { - RunTest(sendbuff, recvbuff, N, type, op, comms, dList); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaFree(sendbuff[i])); - CUDACHECK(cudaFree(recvbuff[i])); - } - - free(sendbuff); - free(recvbuff); -} - -void usage() { - printf("Tests nccl ReduceScatter with user supplied arguments.\n" - " Usage: all_reduce_test [number of GPUs] " - "[GPU 0] [GPU 1] ...\n\n"); -} - -int main(int argc, char* argv[]) { - int nVis = 0; - CUDACHECK(cudaGetDeviceCount(&nVis)); - - int N = 0; - if (argc > 1) { - int t = sscanf(argv[1], "%d", &N); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } else { - printf("Error: must specify at least data size in bytes!\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - int nDev = nVis; - if (argc > 2) { - int t = sscanf(argv[2], "%d", &nDev); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } - std::vector dList(nDev); - for (int i = 0; i < nDev; ++i) - dList[i] = i % nVis; - - if (argc > 3) { - if (argc - 3 != nDev) { - printf("Error: insufficient number of GPUs in list\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - for (int i = 0; i < nDev; ++i) { - int t = sscanf(argv[3 + i], "%d", dList.data() + i); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[2 + i]); - usage(); - exit(EXIT_FAILURE); - } - } - } - - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); - - printf("# Using devices\n"); - for (int g = 0; g < nDev; ++g) { - int cudaDev; - int rank; - cudaDeviceProp prop; - NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); - NCCLCHECK(ncclCommUserRank(comms[g], &rank)); - CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); - printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, - prop.pciBusID, prop.name); - } - printf("\n"); - - printf("# %10s %12s %6s %6s out-of-place " - "in-place\n", "", "", "", ""); - printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", - "bytes", "N", "type", "op", "time", "algbw", "busbw", "delta", "time", - "algbw", "busbw", "delta"); - - RunTests(N / sizeof(char), ncclChar, comms, dList); - RunTests(N / sizeof(int), ncclInt, comms, dList); -#ifdef CUDA_HAS_HALF - RunTests(N / sizeof(half), ncclHalf, comms, dList); -#endif - RunTests(N / sizeof(float), ncclFloat, comms, dList); - RunTests(N / sizeof(double), ncclDouble, comms, dList); - RunTests(N / sizeof(long long), ncclInt64, comms, dList); - RunTests(N / sizeof(unsigned long long), ncclUint64, comms, dList); - - printf("\n"); - - for(int i=0; i -#include -#include -#include -#include - -#include "nccl.h" -#include "test_utilities.h" -#include - -int csv = false; -int errors = 0; -double avg_bw = 0.0; -int avg_count = 0; -bool is_reduction = true; - -template -void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, - const ncclRedOp_t op, int root, ncclComm_t* const comms, - const std::vector& dList) { - - // initialize data - T* buffer = (T*)malloc(N * sizeof(T)); - T* result = (T*)malloc(N * sizeof(T)); - memset(buffer, 0, N * sizeof(T)); - memset(result, 0, N * sizeof(T)); - - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); 
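-    // One CUDA stream per device: every collective below is enqueued on these
-    // streams, and the timer stops only after each stream has been synchronized.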
- CUDACHECK(cudaStreamCreate(s+i)); - CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T))); - Randomize(sendbuff[i], N, i); - if(i == 0) { - CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost)); - } else { - Accumulate(result, sendbuff[i], N, op); - } - } - - // warm up GPU - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), - type, op, root, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - -// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1) - { - int n = N; - printf((csv) ? "%i,%i,%s,%s,%d," : "%12i %12i %6s %6s %4d", - (int) (n * sizeof(T)), n, TypeName(type).c_str(), - OperationName(op).c_str(), root); - - // do out-of-place reduction first - nvtxRangePushA("out of place"); - auto start = std::chrono::high_resolution_clock::now(); - //for (int i=0; i<100; i++) { - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op, - root, comms[i], s[i])); - } - //} - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - nvtxRangePop(); - - nvtxRangePushA("out of place bookkeeping"); - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); // / 100.0; - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw; - - CUDACHECK(cudaSetDevice(dList[root])); - double maxDelta = CheckDelta(recvbuff[root], result, N); - - printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le", - elapsedSec * 1.0E3, algbw, busbw, maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - nvtxRangePop(); - } - - -// for (int n = 0; n <= N; n = (n > 0) ? 
n << 1 : 1) - { - int n = N; - // now do in-place reduction - nvtxRangePushA("in place"); - auto start = std::chrono::high_resolution_clock::now(); - //for (int i=0; i<100; i++) { - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op, - root, comms[i], s[i])); - } - //} - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - nvtxRangePop(); - - nvtxRangePushA("in place bookkeeping"); - double elapsedSec = - std::chrono::duration_cast>( - stop - start).count(); // / 100.0; - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw; - - CUDACHECK(cudaSetDevice(dList[root])); - double maxDelta = CheckDelta(sendbuff[root], result, N); - - printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n", - elapsedSec * 1.0E3, algbw, busbw, maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - nvtxRangePop(); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamDestroy(s[i])); - } - free(s); - free(buffer); - free(result); -} - -template -void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms, - const std::vector& dList) { - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - T** sendbuff = (T**)malloc(nDev * sizeof(T*)); - T** recvbuff = (T**)malloc(nDev * sizeof(T*)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T))); - CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T))); - } - - for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) { -// for (ncclRedOp_t op : { ncclSum }) { - for(int root=0; root(sendbuff, recvbuff, N, type, op, root, comms, dList); - } - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaFree(sendbuff[i])); - CUDACHECK(cudaFree(recvbuff[i])); - } - - free(sendbuff); - free(recvbuff); -} - -void usage() { - printf("Tests nccl Reduce with user supplied arguments.\n" - " Usage: reduce_test [number of GPUs] " - "[GPU 0] [GPU 1] ...\n\n"); -} - -int main(int argc, char* argv[]) { - int nVis = 0; - CUDACHECK(cudaGetDeviceCount(&nVis)); - - int N = 0; - if (argc > 1) { - int t = sscanf(argv[1], "%d", &N); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } else { - printf("Error: must specify at least data size in bytes!\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - int nDev = nVis; - if (argc > 2) { - int t = sscanf(argv[2], "%d", &nDev); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } - std::vector dList(nDev); - for (int i = 0; i < nDev; ++i) - dList[i] = i % nVis; - - if (argc > 3) { - if (argc - 3 != nDev) { - printf("Error: insufficient number of GPUs in list\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - for (int i = 0; i < nDev; ++i) { - int t = sscanf(argv[3 + i], "%d", dList.data() + i); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[2 + i]); - usage(); - exit(EXIT_FAILURE); - } - } - } - - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); - - if (!csv) { - printf("# Using devices\n"); - for (int g = 0; g < nDev; ++g) { - int cudaDev; - int rank; - 
cudaDeviceProp prop; - NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); - NCCLCHECK(ncclCommUserRank(comms[g], &rank)); - CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); - printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, - prop.pciBusID, prop.name); - } - printf("\n"); - - printf("# %10s %12s %6s %6s %4s out-of-place in-place\n", "", "", "", "", ""); - printf("# %10s %12s %6s %6s %4s %7s %5s %5s %7s %7s %5s %5s %7s\n", - "bytes", "N", "type", "op", "root", - "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); - } - else { - printf("B,N,type,op,root,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n"); - } - - RunTests(N / sizeof(char), ncclChar, comms, dList); - RunTests(N / sizeof(int), ncclInt, comms, dList); -#ifdef CUDA_HAS_HALF - RunTests(N / sizeof(half), ncclHalf, comms, dList); -#endif - RunTests(N / sizeof(float), ncclFloat, comms, dList); - RunTests(N / sizeof(double), ncclDouble, comms, dList); - RunTests(N / sizeof(long long), ncclInt64, comms, dList); - RunTests(N / sizeof(unsigned long long), ncclUint64, comms, dList); - - printf("\n"); - - for(int i = 0; i < nDev; ++i) - ncclCommDestroy(comms[i]); - free(comms); - - char* str = getenv("NCCL_TESTS_MIN_BW"); - double check_avg_bw = str ? atof(str) : -1; - avg_bw /= avg_count; - - printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK"); - printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK")); - printf("\n"); - if (errors || avg_bw < check_avg_bw) - exit(EXIT_FAILURE); - else - exit(EXIT_SUCCESS); -} - diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 8076d48428b1e0..ab795867c8f970 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -236,10 +236,10 @@ function build_nccl() { ${CMAKE_INSTALL} -j"$MAX_JOBS" mkdir -p ${INSTALL_DIR}/lib find lib -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "${INSTALL_DIR}/lib/" - if [ ! -f "${INSTALL_DIR}/lib/libnccl.so" ]; then - ln -s "${INSTALL_DIR}/lib/libnccl.so.1" "${INSTALL_DIR}/lib/libnccl.so" - fi popd + if [ -f "./nccl/nccl/src/nccl.h" ]; then + rm ./nccl/nccl/src/nccl.h + fi } # purposefully not using build() because we need Caffe2 to build the same @@ -309,7 +309,7 @@ function build_caffe2() { fi # This is needed by the aten tests built with caffe2 - if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then + if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! 
-f "lib/libnccl.so.2" ]; then # $SYNC_COMMAND root/torch/lib/tmp_install/libnccl root/build/lib/libnccl find "${INSTALL_DIR}/lib" -name "libnccl.so*" | xargs -I {} $SYNC_COMMAND {} "lib/" fi diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 7b2e80de13e784..ae2b0b0429608a 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -146,6 +146,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/import.cpp ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp ${TORCH_SRC_DIR}/csrc/jit/constants.cpp + ${TORCH_SRC_DIR}/csrc/jit/node_hashing.cpp ${TORCH_SRC_DIR}/csrc/jit/ir.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp @@ -153,6 +154,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_pooling.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/inline_autodiff_subgraphs.cpp diff --git a/torch/csrc/api/include/torch/detail/ordered_dict.h b/torch/csrc/api/include/torch/detail/ordered_dict.h index cb7cc59a3df66d..b9209a38930a66 100644 --- a/torch/csrc/api/include/torch/detail/ordered_dict.h +++ b/torch/csrc/api/include/torch/detail/ordered_dict.h @@ -72,9 +72,10 @@ class OrderedDict { // Move works by default, because you can move-construct vectors of const // values.. - OrderedDict(OrderedDict&& other) noexcept( - noexcept(std::unordered_map()) && - noexcept(std::vector())) = default; + // NB: I tried to make this noexcept (conditional on the move constructors of + // index_ and items_ being noexcept) but the obvious spelling didn't compile + // on Windows. 
+ OrderedDict(OrderedDict&& other) = default; OrderedDict& operator=(OrderedDict&& other) = default; ~OrderedDict() = default; diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index b27ceebce307e1..291620e69f5f2f 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -10,6 +10,7 @@ #include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/passes/batch_mm.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" +#include "torch/csrc/jit/passes/constant_pooling.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/passes/erase_number_types.h" @@ -444,6 +445,7 @@ struct GraphExecutorImpl { void runOptimization(std::shared_ptr& graph, const ArgumentSpec& spec) { EliminateDeadCode(graph); EliminateCommonSubexpression(graph); + ConstantPooling(graph); UnrollLoops(graph); PeepholeOptimize(graph); CheckInplace(graph); diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 27acf046436326..3fc19d64356f4f 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -7,7 +7,6 @@ #include "torch/csrc/jit/operator.h" #include -#include #include #include @@ -22,26 +21,47 @@ namespace onnx = ::ONNX_NAMESPACE; // IR graph construction -class DecoderBase { - protected: - virtual std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto); +class ModuleDecoder { + public: + ModuleDecoder(ModuleLookup module_lookup, + std::istream& in); + + private: + std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto); void buildBlock(const onnx::GraphProto& graph_proto, Block* block, - std::unordered_map& value_map); + std::unordered_map& value_map); void buildBlocks(const std::vector& graphs_, Node* node, std::unordered_map& value_map); - virtual void buildValue(Value* value, const onnx::ValueInfoProto& valueinfo_proto) {}; + void buildValue(Value* value, const onnx::ValueInfoProto& valueinfo_proto); - virtual void buildIntermediateValue(Value* value, const std::string& name) {}; + void buildIntermediateValue(Value* value, const std::string& name); at::ScalarType onnxTypeToATenType(onnx::TensorProto_DataType tensor_proto); - virtual at::Tensor buildTensor(const onnx::TensorProto& tensor_proto); + at::Tensor buildTensor(const onnx::TensorProto& tensor_proto); + + TypePtr buildType(const onnx::TypeProto& type_proto); + + at::Tensor buildParameter(const onnx::TensorProto& tensor_proto); + + at::Tensor buildTensorCommon(const onnx::TensorProto& tensor_proto, + const uint64_t record_number, + const int64_t storage_offset, + const std::vector& strides); + + std::pair, std::string> parseFullName( + ModuleLookup module_lookup, + const std::string fullname); + + PyTorchStreamReader stream_reader_; + std::unordered_map> storage_map_; + std::unordered_map value_type_map_; }; -at::ScalarType DecoderBase::onnxTypeToATenType(onnx::TensorProto_DataType onnx_type) { +at::ScalarType ModuleDecoder::onnxTypeToATenType(onnx::TensorProto_DataType onnx_type) { switch(onnx_type) { case onnx::TensorProto_DataType_UINT8: return at::kByte; @@ -64,21 +84,7 @@ at::ScalarType DecoderBase::onnxTypeToATenType(onnx::TensorProto_DataType onnx_t } } -at::Tensor DecoderBase::buildTensor(const onnx::TensorProto& tensor_proto) { - at::Tensor tensor = at::empty({0}, at::initialTensorOptions().dtype(onnxTypeToATenType(tensor_proto.data_type()))); - std::vector sizes = { tensor_proto.dims().begin(), tensor_proto.dims().end() 
}; - tensor.resize_(sizes); - - JIT_ASSERT( - tensor.storage().size() * - tensor.storage().elementSize() == - tensor_proto.raw_data().size()); - - std::memcpy(tensor.data_ptr(), tensor_proto.raw_data().data(), tensor_proto.raw_data().size()); - return tensor; -} - -void DecoderBase::buildBlocks( +void ModuleDecoder::buildBlocks( const std::vector& graphs_, Node* node, std::unordered_map& value_map) { for (auto g_ : graphs_) { @@ -87,7 +93,7 @@ void DecoderBase::buildBlocks( } } -std::shared_ptr DecoderBase::buildGraph(const onnx::GraphProto& graph_proto) { +std::shared_ptr ModuleDecoder::buildGraph(const onnx::GraphProto& graph_proto) { auto graph = std::make_shared(); std::unordered_map value_map; @@ -96,8 +102,11 @@ std::shared_ptr DecoderBase::buildGraph(const onnx::GraphProto& graph_pro return graph; } -void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, +void ModuleDecoder::buildBlock(const onnx::GraphProto& graph_proto, Block* block, std::unordered_map& value_map) { + for (auto &subtype : graph_proto.value_info()) { + value_type_map_[subtype.name()] = &subtype.type(); + } for (auto & input : graph_proto.input()) { auto value = block->addInput(); @@ -180,45 +189,6 @@ void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, } } -class ModuleDecoder : DecoderBase { - public: - ModuleDecoder(ModuleLookup module_lookup, - std::istream& in); - - private: - virtual std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) override; - - virtual at::Tensor buildTensor(const onnx::TensorProto& tensor_proto) override; - - TypePtr buildType(const onnx::TypeProto& type_proto); - - virtual void buildValue(Value* value, const onnx::ValueInfoProto& valueinfo_proto) override; - - virtual void buildIntermediateValue(Value* value, const std::string& name) override; - - at::Tensor buildParameter(const onnx::TensorProto& tensor_proto); - - at::Tensor buildTensorCommon(const onnx::TensorProto& tensor_proto, - const uint64_t record_number, - const int64_t storage_offset, - const std::vector& strides); - - std::pair, std::string> parseFullName( - ModuleLookup module_lookup, - const std::string fullname); - - PyTorchStreamReader stream_reader_; - std::unordered_map> storage_map_; - std::unordered_map value_type_map_; -}; - -std::shared_ptr ModuleDecoder::buildGraph(const onnx::GraphProto& graph_proto) { - for (auto &subtype : graph_proto.value_info()) { - value_type_map_[subtype.name()] = &subtype.type(); - } - return DecoderBase::buildGraph(graph_proto); -} - TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { auto tensortype_proto = type_proto.tensor_type(); auto shape_proto = tensortype_proto.shape(); diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index e84269513bcfe6..12d05539bc869b 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -15,6 +15,7 @@ #include "torch/csrc/jit/passes/erase_number_types.h" #include "torch/csrc/jit/passes/onnx/prepare_division_for_onnx.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" +#include "torch/csrc/jit/passes/constant_pooling.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" #include "torch/csrc/jit/passes/peephole.h" #include "torch/csrc/jit/passes/canonicalize.h" @@ -88,6 +89,7 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_cse", [](std::shared_ptr& g) { return EliminateCommonSubexpression(g); // overload resolution }) + .def("_jit_pass_constant_pooling", ConstantPooling) .def("_jit_pass_peephole", 
PeepholeOptimize) .def("_jit_pass_canonicalize", [](const std::shared_ptr& g) { return Canonicalize(g); diff --git a/torch/csrc/jit/node_hashing.cpp b/torch/csrc/jit/node_hashing.cpp new file mode 100644 index 00000000000000..4ba43f90253c21 --- /dev/null +++ b/torch/csrc/jit/node_hashing.cpp @@ -0,0 +1,113 @@ +#include "torch/csrc/jit/ir.h" + +#include +#include + +#include "torch/csrc/jit/assertions.h" +#include "torch/csrc/jit/interned_strings.h" +#include "torch/csrc/jit/passes/common_subexpression_elimination.h" +#include "torch/csrc/jit/node_hashing.h" +#include "torch/csrc/utils/functional.h" +#include "torch/csrc/utils/hash.h" + +namespace torch { namespace jit { + +namespace { + +bool tensorEqual(const at::Tensor& lhs, const at::Tensor& rhs) { + return &lhs.type() == &rhs.type() && lhs.equal(rhs); +} + +bool tensorListEqual(const std::vector& lhs, const std::vector& rhs) { + if (lhs.size() != rhs.size()) return false; + return std::equal(lhs.begin(), lhs.end(), rhs.begin(), tensorEqual); +} + + +// Check whether two nodes have the same attributes in CSE. +// This function may be too conservative for general use. +// Do NOT support g/gs attributes. +bool attributesEqualCSE(const Node* lhs, const Node* rhs) { + JIT_ASSERT(lhs != nullptr); + JIT_ASSERT(rhs != nullptr); + // One has attributes, the other does not. + if (lhs->hasAttributes() != rhs->hasAttributes()) return false; + // Neither has attributes. + if (!lhs->hasAttributes() && !rhs->hasAttributes()) return true; + + auto lnames = lhs->attributeNames(); + auto rnames = rhs->attributeNames(); + std::sort(lnames.begin(), lnames.end()); + std::sort(rnames.begin(), rnames.end()); + if (lnames != rnames) return false; + + for (auto name : lnames) { + if (lhs->kindOf(name) != rhs->kindOf(name)) return false; + + #define COMPARE_ATTRIBUTEVALUE(type) \ + case AttributeKind::type: \ + { if (lhs->type(name) != rhs->type(name)) return false; } break; + + switch(lhs->kindOf(name)) { + COMPARE_ATTRIBUTEVALUE(f) + COMPARE_ATTRIBUTEVALUE(fs) + COMPARE_ATTRIBUTEVALUE(i) + COMPARE_ATTRIBUTEVALUE(is) + COMPARE_ATTRIBUTEVALUE(s) + COMPARE_ATTRIBUTEVALUE(ss) + case AttributeKind::t: { + if (!tensorEqual(lhs->t(name), rhs->t(name))) return false; + break; + } + case AttributeKind::ts: { + if (!tensorListEqual(lhs->ts(name), rhs->ts(name))) return false; + break; + } + case AttributeKind::g: + case AttributeKind::gs: + return false; + } + + #undef COMPARE_ATTRIBUTEVALUE + } + + return true; +} + +} // anonymous namespace + + +size_t HashNode::operator()(const Node* k) const { + JIT_ASSERT(k != nullptr); + return get_hash(k->kind(), + fmap(k->outputs(), [](const Value *v) { return v->type()->kind(); }), + fmap(k->inputs(), [](const Value *v) { return v->unique(); })); +}; + +bool EqualNode::operator()(const Node* lhs, const Node* rhs) const { + if (lhs == nullptr && rhs == nullptr) return true; + if (lhs == nullptr || rhs == nullptr) return false; + + if (lhs->kind() != rhs->kind()) return false; + + // Check whether the output types are the same. + auto lhs_outputs = lhs->outputs(); + auto rhs_outputs = rhs->outputs(); + if (lhs_outputs.size() != rhs_outputs.size()) return false; + for (size_t i = 0; i < lhs_outputs.size(); ++i) { + if (*lhs_outputs[i]->type() != *rhs_outputs[i]->type()) + return false; + } + + // Check whether the inputs are the same. 
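+  // NB: pointer equality on the input Values suffices here: passes that use
+  // EqualNode (e.g. CSE) visit nodes in topological order, so equivalent
+  // producers have already been merged before their consumers are compared.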
+ auto lhs_inputs = lhs->inputs(); + auto rhs_inputs = rhs->inputs(); + if (lhs_inputs.size() != rhs_inputs.size()) return false; + if (!std::equal(lhs_inputs.begin(), lhs_inputs.end(), rhs_inputs.begin())) return false; + + if (!attributesEqualCSE(lhs, rhs)) return false; + + return true; +}; + +}} diff --git a/torch/csrc/jit/node_hashing.h b/torch/csrc/jit/node_hashing.h new file mode 100644 index 00000000000000..45f10c9dee29e9 --- /dev/null +++ b/torch/csrc/jit/node_hashing.h @@ -0,0 +1,15 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +struct HashNode { + size_t operator()(const Node* k) const; +}; + +struct EqualNode { + bool operator()(const Node* lhs, const Node* rhs) const; +}; + +}} diff --git a/torch/csrc/jit/passes/common_subexpression_elimination.cpp b/torch/csrc/jit/passes/common_subexpression_elimination.cpp index 3281fe2633e81f..fcafe1bdef51ce 100644 --- a/torch/csrc/jit/passes/common_subexpression_elimination.cpp +++ b/torch/csrc/jit/passes/common_subexpression_elimination.cpp @@ -6,118 +6,17 @@ #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" +#include "torch/csrc/jit/node_hashing.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/utils/hash.h" namespace torch { namespace jit { -namespace { - -bool tensorEqual(const at::Tensor& lhs, const at::Tensor& rhs) { - return &lhs.type() == &rhs.type() && lhs.equal(rhs); -} - -bool tensorListEqual(const std::vector& lhs, const std::vector& rhs) { - if (lhs.size() != rhs.size()) return false; - return std::equal(lhs.begin(), lhs.end(), rhs.begin(), tensorEqual); -} - - -// Check whether two nodes have the same attributes in CSE. -// This function may be too conservative for general use. -// Do NOT support t/ts/g/gs attributes. -// If t/ts are supported, CONSTANT node comparison may need to consider device. -bool attributesEqualCSE(const Node* lhs, const Node* rhs) { - JIT_ASSERT(lhs != nullptr); - JIT_ASSERT(rhs != nullptr); - // One has attributes, the other does not. - if (lhs->hasAttributes() != rhs->hasAttributes()) return false; - // Neither has attributes. 
- if (!lhs->hasAttributes() && !rhs->hasAttributes()) return true; - - auto lnames = lhs->attributeNames(); - auto rnames = rhs->attributeNames(); - std::sort(lnames.begin(), lnames.end()); - std::sort(rnames.begin(), rnames.end()); - if (lnames != rnames) return false; - - for (auto name : lnames) { - if (lhs->kindOf(name) != rhs->kindOf(name)) return false; - - #define COMPARE_ATTRIBUTEVALUE(type) \ - case AttributeKind::type: \ - { if (lhs->type(name) != rhs->type(name)) return false; } break; - - switch(lhs->kindOf(name)) { - COMPARE_ATTRIBUTEVALUE(f) - COMPARE_ATTRIBUTEVALUE(fs) - COMPARE_ATTRIBUTEVALUE(i) - COMPARE_ATTRIBUTEVALUE(is) - COMPARE_ATTRIBUTEVALUE(s) - COMPARE_ATTRIBUTEVALUE(ss) - case AttributeKind::t: { - if (!tensorEqual(lhs->t(name), rhs->t(name))) return false; - break; - } - case AttributeKind::ts: { - if (!tensorListEqual(lhs->ts(name), rhs->ts(name))) return false; - break; - } - case AttributeKind::g: - case AttributeKind::gs: - return false; - } - - #undef COMPARE_ATTRIBUTEVALUE - } - - return true; -} - -struct HashNodeCSE { - size_t operator()(const Node* k) const { - JIT_ASSERT(k != nullptr); - return get_hash(k->kind(), - fmap(k->outputs(), [](const Value *v) { return v->type()->kind(); }), - fmap(k->inputs(), [](const Value *v) { return v->unique(); })); - } -}; - -struct EqualNodeCSE { - bool operator()(const Node* lhs, const Node* rhs) const { - if (lhs == nullptr && rhs == nullptr) return true; - if (lhs == nullptr || rhs == nullptr) return false; - - if (lhs->kind() != rhs->kind()) return false; - - // Check whether the output types are the same. - auto lhs_outputs = lhs->outputs(); - auto rhs_outputs = rhs->outputs(); - if (lhs_outputs.size() != rhs_outputs.size()) return false; - for (size_t i = 0; i < lhs_outputs.size(); ++i) { - if (*lhs_outputs[i]->type() != *rhs_outputs[i]->type()) - return false; - } - - // Check whether the inputs are the same. - auto lhs_inputs = lhs->inputs(); - auto rhs_inputs = rhs->inputs(); - if (lhs_inputs.size() != rhs_inputs.size()) return false; - if (!std::equal(lhs_inputs.begin(), lhs_inputs.end(), rhs_inputs.begin())) return false; - - if (!attributesEqualCSE(lhs, rhs)) return false; - - return true; - } -}; - -} // anonymous namespace - // The function implements common subexpression elimination. // Since the nodes are visited in topological order, one pass is enough. 
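// The hashing and equality functors formerly defined here (HashNodeCSE /
// EqualNodeCSE) now live in node_hashing.{h,cpp} as HashNode / EqualNode,
// so that ConstantPooling can reuse the same structural-equality machinery.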
void EliminateCommonSubexpression(Block * block, std::function parent_lookup_fn) { - std::unordered_set subexprs; + std::unordered_set subexprs; for (auto it = block->nodes().begin(); it != block->nodes().end(); ++ it) { auto node = *it; if (node->kind() == prim::PythonOp diff --git a/torch/csrc/jit/passes/constant_pooling.cpp b/torch/csrc/jit/passes/constant_pooling.cpp new file mode 100644 index 00000000000000..ecb2c78ee6fd53 --- /dev/null +++ b/torch/csrc/jit/passes/constant_pooling.cpp @@ -0,0 +1,53 @@ +#include "torch/csrc/jit/ir.h" +#include +#include "torch/csrc/jit/interned_strings.h" +#include "torch/csrc/jit/passes/constant_pooling.h" +#include "torch/csrc/jit/node_hashing.h" + +namespace torch { namespace jit { + +namespace { + +//Very similar to the common subexpression elimination pass +//Move all constants to the beginning of the graph, and deduplicate +void ConstantPooling(Block * block, std::unordered_set& constants) { + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + auto node = *it; + // node may be moved to a different block so advance iterator now + ++it; + if (!node->blocks().empty()) { + // Traverse sub-blocks. + for (auto block : node->blocks()) { + ConstantPooling(block, constants); + } + continue; + } + + if (node->kind() != prim::Constant) { + continue; + } + + auto first_node = node->owningGraph()->block()->nodes().front(); + if (node != first_node) + node->moveBefore(first_node); + + // Check whether the same constant already exists. + auto subit = constants.insert(node); + if (!subit.second) { + // constant exists, replace the uses of node, and destroy it. + auto existing = *subit.first; + node->replaceAllUsesWith(existing); + node->destroy(); + } + } +} + +} // anonymous namespace + + +void ConstantPooling(const std::shared_ptr& graph) { + std::unordered_set constants; + ConstantPooling(graph->block(), constants); +} + +}} diff --git a/torch/csrc/jit/passes/constant_pooling.h b/torch/csrc/jit/passes/constant_pooling.h new file mode 100644 index 00000000000000..284cb90bb29cea --- /dev/null +++ b/torch/csrc/jit/passes/constant_pooling.h @@ -0,0 +1,9 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +TORCH_API void ConstantPooling(const std::shared_ptr& graph); + +}} diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 474722f5362031..36757f97e6e7f0 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1,6 +1,7 @@ #include "torch/csrc/jit/script/compiler.h" #include "torch/csrc/jit/passes/lower_tuples.h" #include "torch/csrc/jit/passes/annotate_effects.h" +#include "torch/csrc/jit/passes/constant_pooling.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" @@ -903,6 +904,7 @@ struct to_ir { AnnotateEffects(graph); // remove any uses of tuples that we inserted that are not needed LowerSimpleTuples(graph); + ConstantPooling(graph); } private: diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 7295b56150c0f5..4f8b8abfb14968 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -641,6 +641,10 @@ def le(g, input, other): return g.op("Not", gt(g, input, _if_scalar_type_as(g, other, input))) +def where(g, condition, self, other): + return g.op("ATen", condition, self, other, operator_s="where") + + @parse_args('v', 'i') def log_softmax(g, input, dim=None): # PyTorch dim and ONNX axis have different meanings.
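Taken together, node_hashing.h and the two passes above follow one pattern: an std::unordered_set keyed by a structural hash and equality, where a failed insert() identifies a duplicate node that can be replaced and destroyed. Below is a minimal self-contained sketch of that pattern; MiniNode and its fields are invented stand-ins for the JIT's Node and Value types, not the real API.

#include <cstdio>
#include <functional>
#include <unordered_set>
#include <vector>

// Invented stand-in for a JIT node: 'kind' mimics Node::kind(), 'inputs'
// mimics the unique ids of the input Values.
struct MiniNode {
  int kind;
  std::vector<int> inputs;
};

// Analogue of HashNode: hash over the kind and input ids.
struct MiniHash {
  size_t operator()(const MiniNode* n) const {
    size_t h = std::hash<int>()(n->kind);
    for (int i : n->inputs) h = h * 31 + std::hash<int>()(i);
    return h;
  }
};

// Analogue of EqualNode: structural comparison of kind and inputs.
struct MiniEqual {
  bool operator()(const MiniNode* a, const MiniNode* b) const {
    return a->kind == b->kind && a->inputs == b->inputs;
  }
};

int main() {
  MiniNode n1{1, {0, 2}}, n2{1, {0, 2}}, n3{2, {0, 2}};
  std::unordered_set<const MiniNode*, MiniHash, MiniEqual> seen;
  seen.insert(&n1);
  // A failed insert is the duplicate signal: CSE would replace n2's uses
  // with n1 and destroy n2; ConstantPooling does the same for constants.
  printf("n2 is a duplicate: %d\n", !seen.insert(&n2).second);  // prints 1
  printf("n3 is a duplicate: %d\n", !seen.insert(&n3).second);  // prints 0
}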