
Merge from upstream #43

Merged Jul 20, 2018 · 64 commits

Commits
52cc073
Implement reshape_as (#9452)
vishwakftw Jul 17, 2018
7d2a178
test_cuda: ensure tests use float and adjust HalfTensor tolerances (#…
hartb Jul 17, 2018
050a258
change stft to have consistent signature with librosa (#9497)
ssnl Jul 17, 2018
e567879
Reenable multiprocessing preserve sharing tests on ASAN. (#9498)
ezyang Jul 17, 2018
d2d4382
Delete flag from THTensor. (#9494)
ezyang Jul 17, 2018
30f849c
Correct model name in caffe2 onnx backend tests
bddppq Jul 17, 2018
8be4657
Add ideep copy for TensorCPU<long> in IDEEPFallbackOp (#9480)
viswanathgs Jul 17, 2018
890037e
Fix (non-reduction) ops over a dimension for n-dimensional empty tens…
gchanan Jul 17, 2018
2249751
Add OptimizerBase::add_parameters (#9472)
goldsborough Jul 17, 2018
9b0c53a
Deduplicate THTensor and THCTensor. (#9495)
ezyang Jul 17, 2018
0fe980c
Memory usage measurement -- Caffe2 (#9017)
lilinyy09 Jul 17, 2018
6116954
oss heatmap_max_keypoint_op
wat3rBro Jul 17, 2018
5c695e3
Implement 2D and 3D alpha_dropout (#9073)
tippisum Jul 18, 2018
1c3580b
Added hash for device (#9246)
vishwakftw Jul 18, 2018
13e0c92
Add Support for count_include_pad in AveragePool in Caffe2 ONNX Backe…
hl475 Jul 18, 2018
c33d2c0
Thread-safe dispatcher table (#9126)
Jul 18, 2018
004d924
Give THTensor a constructor, use new/free. (#9496)
ezyang Jul 18, 2018
aa73348
added reminder of args naming rules to readme (#9504)
weiyangfb Jul 18, 2018
543d4af
Be strict prototypes clean. (#9516)
ezyang Jul 18, 2018
6de0382
Add random data filler to predictor bench to support production nets …
highker Jul 18, 2018
89db578
Fixed a typo
lewha0 Jul 18, 2018
73225e4
add docs for using `python setup.py clean` in developing mode (#9524)
fehiepsi Jul 18, 2018
3eb3f03
ROCm contributions week 28 (#9432)
iotamudelta Jul 18, 2018
5760821
Make squeeze doc consistent with it's behaviour (#9529)
albanD Jul 18, 2018
5eaed75
Implementing torch.isfinite (#9487)
bhushan23 Jul 18, 2018
f277645
Support N-dimensional empty tensors in CPU BLAS and (a selection of) …
gchanan Jul 18, 2018
8fe2622
Fix gatherTopK template (#9231)
malfet Jul 18, 2018
28954b9
Fix RoIAlignOp GPU implementation for RoIs without batch index (#9230)
Jul 18, 2018
07fb072
Merge remote-tracking branch 'upstream/master'
iotamudelta Jul 18, 2018
d6e124e
Dummy CircleCI config. (#9537)
ezyang Jul 18, 2018
35f7925
fix small literals being flushed to 0 by std::to_string
Jul 18, 2018
27455e9
Use _six for inf and nan (#9500)
ssnl Jul 18, 2018
b6b6e1b
Fix core.Plan.create_from_proto (#9438)
volkhin Jul 18, 2018
8c741b7
Add transformation from caffe2::resizeop to onnx::upsample
Jokeren Jul 18, 2018
ca3b36a
Add implementation for batch_moments_op (#9510)
xiaomengy Jul 18, 2018
c506ff9
Disable py2-clang3.8-rocmnightly-ubuntu16.04-test in disabled-configs…
ezyang Jul 18, 2018
8769fec
Move clamp into ATen (#9506)
cpuhrsch Jul 18, 2018
3b88650
Add CUDAGuard to ATen (#9277)
goldsborough Jul 18, 2018
4c615b1
Introduce libtorch to setup.py build (#8792)
anderspapitto Jul 18, 2018
c1ee883
Constructors and member functions for THStorage (#9357)
cpuhrsch Jul 18, 2018
04b33b7
Add byte_weight_dequant_op
Jokeren Jul 18, 2018
b3e141e
Add predictor config into Predictor (#9434)
Jul 18, 2018
604f7e9
Expose CAFFE2_USE_OPENCV preprocessor flag (#9509)
viswanathgs Jul 19, 2018
45f0d05
Adapt OnnxifiOp to removed suffix handling in ONNXIFI loader (#9571)
Jul 19, 2018
54db14e
HIP Operators Generator--> HipOpG (#9322)
petrex Jul 19, 2018
e0446fc
Pass dtype to tensor contructor in test_neg (#9558)
Jul 19, 2018
aee9e90
Fix TestAutograd.test_as_strided (#9538)
ssnl Jul 19, 2018
d4fa0e6
Merge remote-tracking branch 'rocm_upstream/master'
iotamudelta Jul 19, 2018
9af5625
Merge remote-tracking branch 'upstream/master'
iotamudelta Jul 19, 2018
f180373
Support n-dimensional empty tensors in CUDA BLAS and fix a btrifact b…
gchanan Jul 19, 2018
f33cd36
Use int64_t for im2col and col2im (#9590)
ssnl Jul 19, 2018
85b2816
quick patch for PackPadded removal to propagate the correct size. (#9…
anderspapitto Jul 19, 2018
6557856
Fix l2 normalization when handling zero vector (#9594)
bairdzhang Jul 19, 2018
f521823
Do not always set broadcast argument when exporting new onnx add and …
bddppq Jul 19, 2018
a08119a
Eliminate direct access to size/strides of THTensor; replace them wit…
ezyang Jul 19, 2018
bcf0bf4
Extend DispatchStub to support CUDA dispatch (#9579)
colesbury Jul 19, 2018
7e78e80
Make error message for empty module friendlier (#9565)
goldsborough Jul 19, 2018
b770156
Functional DataParallel (#9234)
goldsborough Jul 19, 2018
5651b27
Add CAFFE_STATIC_EVENT to Stats (#9501)
Jul 19, 2018
aa7af94
Make JIT tracing a thread-local property (#9414)
apaszke Jul 20, 2018
4028ff6
Revert "quick patch for PackPadded removal to propagate the correct s…
anderspapitto Jul 20, 2018
bfe2aa0
docs fixes (#9607)
ssnl Jul 20, 2018
2a0018f
Add scatter_add_ doc (#9630)
ssnl Jul 20, 2018
5de7af0
Merge remote-tracking branch 'upstream/master'
iotamudelta Jul 20, 2018
Files changed
7 changes: 7 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,7 @@
version: 2
jobs:
  build:
    docker:
      - image: circleci/python:3.7-node-browsers
    steps:
      - run: echo "hello world"
4 changes: 4 additions & 0 deletions .jenkins/caffe2/build.sh
@@ -155,6 +155,9 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
   export LANG=C.UTF-8
   export LC_ALL=C.UTF-8
   export HCC_AMDGPU_TARGET=gfx900
+
+  ########## HIPIFY Caffe2 operators
+  ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py"
 fi

 # Try to include Redis support for Linux builds
@@ -195,6 +198,7 @@ else
 fi


+
 ###############################################################################
 # Configure and make
 ###############################################################################
2 changes: 1 addition & 1 deletion .jenkins/pytorch/build.sh
@@ -104,5 +104,5 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then
   echo "Building libtorch"
   # NB: Install outside of source directory (at the same level as the root
   # pytorch folder) so that it doesn't get cleaned away prior to docker push.
-  WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$PWD/../cpp-build"
+  WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$PWD/../cpp-build"
 fi
2 changes: 2 additions & 0 deletions .jenkins/pytorch/disabled-configs.txt
@@ -3,3 +3,5 @@
 # fail. You can use this to temporarily reserve a test name to
 # turn on CI side before PyTorch repository supports it. This
 # file has the same format as .jenkins/enabled-configs.txt
+
+py2-clang3.8-rocmnightly-ubuntu16.04-test
4 changes: 2 additions & 2 deletions .jenkins/pytorch/macos-test.sh
@@ -57,15 +57,15 @@ test_cpp_api() {
   CPP_BUILD="$PWD/../cpp-build"
   rm -rf $CPP_BUILD
   mkdir -p $CPP_BUILD
-  WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$CPP_BUILD"
+  WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$CPP_BUILD"

   python tools/download_mnist.py --quiet -d test/cpp/api/mnist

   # Unfortunately it seems like the test can't load from miniconda3
   # without these paths being set
   export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib"
   export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib"
-  "$CPP_BUILD"/libtorch/bin/test_api
+  "$CPP_BUILD"/caffe2/bin/test_api
 }

if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then
20 changes: 6 additions & 14 deletions .jenkins/pytorch/test.sh
@@ -9,11 +9,6 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

 echo "Testing pytorch"

-if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-  echo "Skipping ROCm tests for now"
-  exit 0
-fi
-
 # JIT C++ extensions require ninja.
 git clone https://github.com/ninja-build/ninja --quiet
 pushd ninja
@@ -49,13 +44,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
   (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
 fi

-export ATEN_DISABLE_AVX=
-export ATEN_DISABLE_AVX2=
 if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then
-  export ATEN_DISABLE_AVX=1
-fi
-if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
-  export ATEN_DISABLE_AVX2=1
+  export ATEN_CPU_CAPABILITY=default
+elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then
+  export ATEN_CPU_CAPABILITY=avx
 fi

test_python_nn() {
@@ -104,12 +96,12 @@ test_libtorch() {
     echo "Testing libtorch"
     CPP_BUILD="$PWD/../cpp-build"
     if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-      "$CPP_BUILD"/libtorch/bin/test_jit
+      "$CPP_BUILD"/caffe2/bin/test_jit
     else
-      "$CPP_BUILD"/libtorch/bin/test_jit "[cpu]"
+      "$CPP_BUILD"/caffe2/bin/test_jit "[cpu]"
     fi
     python tools/download_mnist.py --quiet -d test/cpp/api/mnist
-    OMP_NUM_THREADS=2 "$CPP_BUILD"/libtorch/bin/test_api
+    OMP_NUM_THREADS=2 "$CPP_BUILD"/caffe2/bin/test_api
   fi
 }

2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -53,6 +53,7 @@ endif()
 # Note to developers: if you add an option below, make sure you also add it to
 # cmake/Summary.cmake so that the summary prints out the option values.
 include(CMakeDependentOption)
+option(BUILD_TORCH "Build Torch" OFF)
 option(BUILD_CAFFE2 "Build Caffe2" ON)
 option(BUILD_ATEN "Build ATen" OFF)
 option(BUILD_BINARY "Build C++ binaries" ON)
@@ -214,6 +215,7 @@ if(NOT MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
   # These flags are not available in GCC-4.8.5. Set only when using clang.
7 changes: 4 additions & 3 deletions CONTRIBUTING.md
@@ -72,6 +72,9 @@ For example:

 You do not need to repeatedly install after modifying python files.

+In case you want to reinstall, make sure that you uninstall pytorch first by running `pip uninstall torch`
+and `python setup.py clean`. Then you can install in `build develop` mode again.
+
 ## Unit testing

 PyTorch's testing is located under `test/`. Run the entire test suite with
@@ -146,9 +149,7 @@ working on:

 - Working on `torch/lib` and want to run your changes / rerun cmake? Run
   `python setup.py build_deps`. Note that this will rerun cmake for
-  every subdirectory in TH; if you are only working on one project,
-  consider editing `torch/lib/build_all.sh` and commenting out the
-  `build` lines of libraries you are not working on.
+  every subdirectory in TH.

 On the initial build, you can also speed things up with the environment
 variables `DEBUG` and `NO_CUDA`.
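For quick reference, the reinstall workflow documented in the CONTRIBUTING.md hunk above amounts to the following shell sequence (a sketch assembled from the commands quoted in the diff; run it from the root of your pytorch checkout):

```sh
pip uninstall torch               # uninstall the package first
python setup.py clean             # clear out stale build artifacts
python setup.py build develop     # then reinstall in develop mode
```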
12 changes: 9 additions & 3 deletions aten/CMakeLists.txt
@@ -80,14 +80,20 @@ add_subdirectory(src/TH)
 set(TH_CPU_INCLUDE
   # dense
   ${CMAKE_CURRENT_SOURCE_DIR}/src/TH
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/THC
   ${CMAKE_CURRENT_BINARY_DIR}/src/TH
-  ${CMAKE_CURRENT_BINARY_DIR}/src/THC

   ${CMAKE_CURRENT_SOURCE_DIR}/src
   ${CMAKE_CURRENT_BINARY_DIR}/src
   ${CMAKE_BINARY_DIR}/aten/src)
 list(APPEND ATen_CPU_INCLUDE ${TH_CPU_INCLUDE})

+if(USE_CUDA OR USE_ROCM)
+  set(TH_CUDA_INCLUDE
+    # dense
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/THC
+    ${CMAKE_CURRENT_BINARY_DIR}/src/THC)
+  list(APPEND ATen_CUDA_INCLUDE ${TH_CUDA_INCLUDE})
+endif()
+
 add_subdirectory(src/THNN)

 # Find the HIP package, set the HIP paths, load the HIP CMake.
1 change: 1 addition & 0 deletions aten/src/ATen/ATen.h
@@ -21,3 +21,4 @@
 #include "ATen/TensorOptions.h"
 #include "ATen/Layout.h"
 #include "ATen/OptionsGuard.h"
+#include "ATen/CUDAGuard.h"
3 changes: 3 additions & 0 deletions aten/src/ATen/Allocator.h
@@ -30,6 +30,9 @@ class DataPtr {
   DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device)
       : ptr_(data, ctx, ctx_deleter), device_(device) {}
   void* operator->() const { return ptr_.get(); }
+  void clear() {
+    ptr_.clear();
+  }
   void* get() const { return ptr_.get(); }
   void* get_context() const { return ptr_.get_context(); }
   void* release_context() { return ptr_.release_context(); }
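Not part of the diff: a minimal sketch of what the new `clear()` enables, namely releasing a `DataPtr`'s memory before the object goes out of scope. The constructor signature comes from the class above; the `at::Device(at::kCPU)` spelling is an assumption of this sketch.

```cpp
#include <ATen/Allocator.h>
#include <cstdlib>

void clear_example() {
  void* data = std::malloc(16);
  // ctx doubles as the pointer to free; std::free matches DeleterFnPtr.
  at::DataPtr ptr(data, /*ctx=*/data, /*ctx_deleter=*/&std::free,
                  at::Device(at::kCPU));  // device spelling assumed
  ptr.clear();  // runs the deleter immediately; the DataPtr now holds nullptr
}
```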
110 changes: 110 additions & 0 deletions aten/src/ATen/CUDAGuard.h
@@ -0,0 +1,110 @@
#pragma once

#include <ATen/ArrayRef.h>
#include <ATen/CUDAStream.h>
#include <ATen/Context.h>
#include <ATen/DeviceGuard.h>

#include <cstddef>
#include <vector>

namespace at {

/// A variant of `DeviceGuard` that augments it with an understanding of CUDA
/// streams. This guard can not only set and reset the current CUDA device, but
/// also set and reset the current CUDA stream. It is important to note that
/// because a CUDA stream is intrinsically associated with the CUDA device to
/// which it is bound, setting the CUDA stream *also* sets the current CUDA
/// device to that of the stream.
struct CUDAGuard {
  /// Default constructor, does nothing and causes no change in the current
  /// stream or device until `set_stream` or `set_device` is called.
  CUDAGuard() = default;

  /// Sets the CUDA stream and its associated device as the current one (calls
  /// `set_stream`).
  explicit CUDAGuard(const CUDAStream& stream) {
    set_stream(stream);
  }

  /// Calls `set_device` with the given index.
  explicit CUDAGuard(int32_t device) {
    set_device(device);
  }

  CUDAGuard(const CUDAGuard&) = delete;
  CUDAGuard& operator=(const CUDAGuard&) = delete;

  /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The
  /// moved-from `CUDAGuard` is modified such that its destruction has no
  /// effect (does not reset the stream or device).
  CUDAGuard(CUDAGuard&& other) noexcept = default;

  /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The
  /// moved-from `CUDAGuard` is modified such that its destruction has no
  /// effect (does not reset the stream or device).
  CUDAGuard& operator=(CUDAGuard&& other) {
    device_guard_ = std::move(other.device_guard_);
    original_streams_ = std::move(other.original_streams_);
    other.original_streams_.clear();
    return *this;
  }

  /// Resets the CUDA stream on each device to the one that was active upon
  /// construction.
  ~CUDAGuard() {
    if (!original_streams_.empty()) {
      for (size_t device = 0; device < original_streams_.size(); ++device) {
        globalContext().uncheckedSetCurrentCUDAStreamOnDevice(
            device, original_streams_[device]);
      }
    }
  }

  /// Sets the current CUDA device to the device associated with the given
  /// stream, and then sets the current stream on that device to the one given.
  void set_stream(const CUDAStream& stream) {
    device_guard_.set_index(stream.device());
    // If we haven't stored the current stream yet, store it now.
    if (original_streams_.empty()) {
      const size_t device_count = globalContext().getNumGPUs();
      original_streams_.reserve(device_count);
      for (size_t device = 0; device < device_count; ++device) {
        original_streams_.push_back(
            globalContext().getCurrentCUDAStreamOnDevice(device));
      }
    }
    globalContext().setCurrentCUDAStreamOnDevice(
        device_guard_.last_index(), stream);
  }

  /// Sets the CUDA device to the given one.
  void set_device(int32_t device) {
    device_guard_.set_index(device);
  }

  /// Returns the CUDA streams that were active in the first call to
  /// `set_stream`. If there was no such call, the returned container is
  /// empty.
  ArrayRef<CUDAStream> original_streams() const noexcept {
    return original_streams_;
  }

  /// Returns the device that was set upon construction of the guard.
  int32_t original_device() const noexcept {
    return device_guard_.original_index();
  }

  /// Returns the last device that was set via `set_device`, if any.
  int32_t last_device() const noexcept {
    return device_guard_.last_index();
  }

 private:
  /// The guard for the current device.
  DeviceGuard device_guard_;
  /// The original streams that were active on all devices.
  std::vector<CUDAStream> original_streams_;
};

} // namespace at
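Not part of the diff: a minimal usage sketch for the new guard. The function name is hypothetical; `at::CUDAStream` comes from the `ATen/CUDAStream.h` header included above.

```cpp
#include <ATen/CUDAGuard.h>

void launch_on(const at::CUDAStream& stream) {
  // Constructing the guard switches the current device to stream.device()
  // and makes `stream` the current stream on that device.
  at::CUDAGuard guard(stream);
  // ... enqueue work on the now-current device and stream ...
}  // destructor restores the streams that were current on every device
```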