
Merge from upstream #149


Merged: 107 commits, merged on Aug 27, 2018
Commits (107)
f9d1b00
Move THNN Reduction to ATen/core. (#10703)
gchanan Aug 21, 2018
30ad13f
Avoid shadowing i, j vars in GeneralProposals test (#10721)
Aug 21, 2018
b0b5139
Set the BUILD_ENVIRONMENT variable before installing sccache. (#10640)
Jorghi12 Aug 21, 2018
b23d59c
Make ONNX_ATEN_FALLBACK as internal default option
houseroad Aug 21, 2018
9c321a8
Add util function from core type to dtype (#10716)
Aug 21, 2018
8a1739b
Add arguments __repr__ in Distribution base class
nadavbh12 Aug 21, 2018
edb3443
More changes for hidden visibility (#10692)
orionr Aug 21, 2018
04b773a
Support Loading to GPU (#10710)
xush6528 Aug 21, 2018
e94ae99
Delete copy constructor/assignment of class Observable explicitly. (#…
tolia-msft Aug 21, 2018
71ddd83
Support custom ops in ScriptModule and tidy up test files (#10610)
goldsborough Aug 22, 2018
44f996f
Py3 fixes for layer_model_helper.py (#10525)
tomdz Aug 22, 2018
6325e5a
fix typo in error message (#9827)
ajaech Aug 22, 2018
ddf187c
Don't assume serialized integral types were widened to int32 in raw_da…
Aug 22, 2018
a2ca634
Add enforce back to converter.cc
bwasti Aug 22, 2018
1068ba6
Create at::linear (#10755)
goldsborough Aug 22, 2018
e5e2514
fix debug_info arg in createOperator and improve reroute_tensor (#10736)
harouwu Aug 22, 2018
abb209e
Fixes *fft docs (#10760)
ssnl Aug 22, 2018
19031c6
Use intrusive_ptr in Storage; replace unique_ptr<Storage> with Storag…
ezyang Aug 22, 2018
22446a3
Productionize CRF layer in PyText (#10362)
ahhegazy Aug 22, 2018
2fe5fa7
Use FinishDeviceComputation instead of adding events in Operator::Syn…
pritamdamania Aug 22, 2018
2276351
Delete THD master_worker (#10731)
ezyang Aug 22, 2018
fbd7189
add explicit flag to build static libtorch (#10754)
anderspapitto Aug 22, 2018
5fb9b31
Add matrix_rank (#10338)
vishwakftw Aug 22, 2018
b0ad810
Split storage from tensor (#10053)
jerryzh168 Aug 22, 2018
9767951
Remove regex matching from undefined_tensor_test, fixes #10013 (#10702)
jsrmath Aug 22, 2018
754ec9e
Reduce rocm link time with ThinLTO
bddppq Aug 22, 2018
0e30fa6
Faster random number generation in fused_rowwise_random_quantization_…
Aug 22, 2018
5ca2713
Fix performance of WeightedRandomSampler (#10636)
Chetter2 Aug 22, 2018
ce957ac
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 22, 2018
ee3e48d
Move Backend, Layout, ATenGeneral, Deprecated, Generator to ATen/core…
gchanan Aug 22, 2018
de11a5f
Resubmit #8322 with scipy version check
ssnl Aug 22, 2018
6fcac35
Erase ListConstruct nodes for ONNX export (#10713)
Aug 22, 2018
d40a598
Back out "[pytorch][PR] Create at::linear" (#10785)
Aug 22, 2018
6c84f7f
Relax RHS type assert for augassign (#10730)
zou3519 Aug 22, 2018
043a2e3
Removing setup_caffe2.py (#10734)
pjh5 Aug 22, 2018
f72e813
Allow tracing functions that take tuples of tensors as inputs (#10637)
apaszke Aug 22, 2018
011a1a2
Skip this test, fails on CI.
iotamudelta Aug 22, 2018
6c75fc0
Integrating stochastic quantization to easgd to reduce communication…
Aug 23, 2018
44b47fd
Working pybind version of MPI process group and abort() pybind (#10606)
teng-li Aug 23, 2018
f0ec3bf
Changes for Python3 compatibility (#10524)
huitseeker Aug 23, 2018
4aa5075
update the constructor to accept the PredictorConfg only to set up th…
Aug 23, 2018
9a43fc5
move HeatmapMaxKeypointOp unittest to oss
wat3rBro Aug 23, 2018
5c0eece
Force types on values returned from if blocks to be equivalent (#10281)
Aug 23, 2018
b885dea
parallelize the dense part in event models
xianjiec Aug 23, 2018
deda05e
Revert D9395814: move HeatmapMaxKeypointOp unittest to oss
ezyang Aug 23, 2018
78f0276
Merge branch 'master' into ifu
iotamudelta Aug 23, 2018
ac8c1eb
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 23, 2018
529fc68
Update docs with clean (#10819)
erikbrinkman Aug 23, 2018
fcfb1c1
Make more distributions jittable
fritzo Aug 23, 2018
0eb2c83
Fix link in THNN/README.md
narumiruna Aug 23, 2018
b14f2e8
Preserve sparse tensor shape and dim invariants, and add scalar tenso…
Aug 23, 2018
ab9e7ae
Add CUDA implementation of LARS --caffe2 (#10509)
lilinyy09 Aug 23, 2018
0f5c8ed
Removes unused THCState code paths (#9735)
mruberry Aug 23, 2018
23a366b
Use ATen native functions for THCTensor_cadd/cmul/cdiv/csub (#10707)
colesbury Aug 23, 2018
82ddeb7
Using shared implementation in Tensor (#10619)
ezyang Aug 23, 2018
432b3ad
Print blob sizes on fatal signal (#10766)
Aug 23, 2018
dec3ed7
Increase the limit for Proto size (#10745)
Aug 23, 2018
9dbcc9c
Move _raw_* intrusive pointer manipulations to raw_intrusive_ptr_targ…
ezyang Aug 23, 2018
3d43a82
Add support for vararg style functions. (#10250)
zdevito Aug 23, 2018
a4c59a9
MIOpen integration, more tests enabled, bug fixes (#10612)
iotamudelta Aug 23, 2018
053f055
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 23, 2018
9403e0c
Use ATen implementation of RNNs (#10761)
goldsborough Aug 23, 2018
ee022a4
Added this-consts to all methods on SymbolicVariable (#10805)
hakobyant Aug 23, 2018
61b34d4
nomnigraph - isSubgraphMatch returns the matched Subgraph & map from …
duc0 Aug 23, 2018
df2d48b
Added PrefixStore, pybind, test for group backward compatibility (#10…
teng-li Aug 24, 2018
2d40eab
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 24, 2018
e05e336
Merge remote-tracking branch 'rocm_upstream/master'
iotamudelta Aug 24, 2018
ac91777
Skip failing test.
iotamudelta Aug 24, 2018
8da4167
Fix performance regression (#10835)
Aug 24, 2018
a49d74a
Merge remote-tracking branch 'rocm_upstream/master' into ifu
iotamudelta Aug 24, 2018
d3fe294
Merge remote-tracking branch 'rocm_upstream/master'
iotamudelta Aug 24, 2018
974eb15
Merge branch 'master' into ifu
iotamudelta Aug 24, 2018
6993e4a
Caffe2 Functional enforcing inplace output (#10797)
xw285cornell Aug 24, 2018
ca56786
Support multidimensional indexing (#10787)
zou3519 Aug 24, 2018
e146518
Fix AT_CUDA_CHECK and AT_CUDNN_CHECK macros (#10834)
colesbury Aug 24, 2018
474bd60
Provide a tensor overload to mul_out_sparse_scalar. (#10828)
gchanan Aug 24, 2018
021f33b
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 24, 2018
8c13971
Remove protobuf require and use requirements.txt (#10771)
orionr Aug 24, 2018
3c9775f
Remove nanopb since we've switched to protobuf (#10772)
orionr Aug 24, 2018
e3d12d7
Automatic update of fbcode/onnx to 6146a85d371481222c10ede4430ad5476e…
houseroad Aug 24, 2018
313139d
Moving the operator argument to the front for kernelPointwiseApply. (…
Jorghi12 Aug 24, 2018
f1df85d
bug-fix in normal_( ) (#10846)
weiyangfb Aug 24, 2018
6e2f6dc
Move Allocator and Device to ATen/core
jerryzh168 Aug 24, 2018
8130b1a
Ignore stack frames coming from python3 object file (#10627)
apaszke Aug 24, 2018
8044dc4
Support new Reshape semantics (#10848)
Aug 24, 2018
104646e
Skip failing test.
iotamudelta Aug 24, 2018
5834fd9
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 24, 2018
474684c
Re-sync with internal repository (#10868)
yns88 Aug 24, 2018
f88ee77
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 24, 2018
74e6a66
If none of the schema match, add ImplicitTensorToNum conversions wher…
Aug 24, 2018
983e0f2
Remove Node::invalidateSchema (#10822)
apaszke Aug 24, 2018
0ef5cfd
fix ivalue printing for lists (#10777)
Aug 24, 2018
0d5584d
Revert D9492561: [pytorch][PR] Moving the operator argument to the fr…
ezyang Aug 24, 2018
87a7840
Remove Tensor constructor of Scalar. (#10852)
gchanan Aug 24, 2018
1fbabff
Refactor THCNumerics and add common math functions for at::Half (#10301)
syed-ahmed Aug 24, 2018
148ea2a
Create at::linear (#10799)
goldsborough Aug 24, 2018
c172ffb
Remove the nanopb submodule
orionr Aug 24, 2018
f2f6e6c
Add registry to pybind_state (#10759)
bwasti Aug 25, 2018
542aadd
Stop using symbolic override for tracing RNNs (#10638)
apaszke Aug 25, 2018
8253cfa
Conv BN fusion for 3D conv (#10239)
jspark1105 Aug 25, 2018
17dac3e
Create class constant for string literal 'blob_names'
Aug 25, 2018
d632ccd
Cache isContiguous and numel
cpuhrsch Aug 25, 2018
ddc37d7
Update mobile predictor caller's interface
Aug 25, 2018
9679fc5
Handling failing test on ROCm.
Jorghi12 Aug 26, 2018
c8b246a
Prevent JIT from overspecializing to every single size configuration …
apaszke Aug 26, 2018
5a43ef6
Merge remote-tracking branch 'upstream/master' into ifu
iotamudelta Aug 26, 2018
231f80a
Skip: new failure.
iotamudelta Aug 27, 2018
Files changed
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,9 +1,6 @@
[submodule "third_party/catch"]
  path = third_party/catch
  url = https://github.com/catchorg/Catch2.git
-[submodule "third_party/nanopb"]
-  path = third_party/nanopb
-  url = https://github.com/nanopb/nanopb.git
[submodule "third_party/pybind11"]
  path = third_party/pybind11
  url = https://github.com/pybind/pybind11.git
9 changes: 5 additions & 4 deletions .jenkins/caffe2/build.sh
@@ -107,10 +107,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then
PROTOBUF_INCDIR=/opt/conda/include pip install -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx"
report_compile_cache_stats
exit 0
-elif [[ $BUILD_ENVIRONMENT == *setup* ]]; then
-  rm -rf $INSTALL_PREFIX && mkdir $INSTALL_PREFIX
-  PYTHONPATH=$INSTALL_PREFIX $PYTHON setup_caffe2.py develop --install-dir $INSTALL_PREFIX
-  exit 0
fi


@@ -156,6 +152,11 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
export LC_ALL=C.UTF-8
export HCC_AMDGPU_TARGET=gfx900

+# The link time of libcaffe2_hip.so takes 40 minutes. According to
+# https://github.com/RadeonOpenCompute/hcc#thinlto-phase-1---implemented
+# using ThinLTO could significantly improve link-time performance.
+export KMTHINLTO=1

########## HIPIFY Caffe2 operators
${PYTHON} "${ROOT_DIR}/tools/amd_build/build_pytorch_amd.py"
${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py"
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -140,7 +140,6 @@ cmake_dependent_option(
USE_MKLML "Use MKLML interface in MKL BLAS" ON
"BUILD_CAFFE2" OFF)
option(USE_DISTRIBUTED "Use THD (distributed)" OFF)
-option(USE_DISTRIBUTED_MW "Use THD (distributed) master worker" OFF)

# Used when building Caffe2 through setup.py
option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF)
2 changes: 2 additions & 0 deletions CONTRIBUTING.md
@@ -182,6 +182,8 @@ information for the code in `torch/csrc`. More information at:
Python `setuptools` is pretty dumb, and always rebuilds every C file in a
project. If you install the ninja build system with `pip install ninja`,
then PyTorch will use it to track dependencies correctly.
+If pytorch was already built, you will need to run `python setup.py clean` once
+after installing ninja for builds to succeed.

#### Use CCache

4 changes: 4 additions & 0 deletions aten/doc/Functions.h
@@ -331,6 +331,7 @@ static inline Tensor & _standard_gamma_out(Tensor & output, const Tensor & self,
static inline Tensor _standard_gamma(const Tensor & self, Generator * generator=nullptr);
static inline Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, const Tensor & alpha, const Tensor & total);
static inline Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total);
+static inline Tensor sparse_coo_tensor(const Type& dtype, IntList size);
static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size);
static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values);
static inline Tensor alias(const Tensor & self);
@@ -1764,6 +1765,9 @@ static inline Tensor & _dirichlet_grad_out(Tensor & output, const Tensor & x, co
static inline Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total) {
  return infer_type(x)._dirichlet_grad(x, alpha, total);
}
+static inline Tensor sparse_coo_tensor(const Type& dtype, IntList size) {
+  return dtype.sparse_coo_tensor(dtype, size);
+}
static inline Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) {
  return infer_type(values).sparse_coo_tensor(indices, values, size);
}
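The first new overload above, `sparse_coo_tensor(dtype, size)`, creates an empty sparse tensor of a given type and shape; the second, `sparse_coo_tensor(indices, values, size)`, builds one from an [ndim x nnz] index tensor and a matching values tensor. Below is a minimal call-site sketch, illustrative only and not part of this PR; it assumes the era's ATen factory functions `at::zeros`/`at::ones` and the `Tensor::select`/`fill_` methods:

    #include <ATen/ATen.h>

    int main() {
      // Two nonzeros in a 3x3 matrix, at coordinates (0, 0) and (1, 1).
      // COO indices form an [ndim x nnz] = [2 x 2] integer tensor.
      auto indices = at::zeros({2, 2}, at::kLong);
      indices.select(/*dim=*/1, /*index=*/1).fill_(1); // column 1 holds (1, 1)
      auto values = at::ones({2});                     // one value per nonzero
      auto sparse = at::sparse_coo_tensor(indices, values, {3, 3});
      return 0;
    }

Selecting column 1 of `indices` and filling it with 1 places the second entry at coordinate (1, 1); column 0 stays zero, placing the first entry at (0, 0).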
3 changes: 2 additions & 1 deletion aten/doc/Tensor.h
@@ -396,7 +396,8 @@ struct Tensor : public detail::TensorBase {
Tensor & _copy_ignoring_overlaps_(const Tensor & src);
Tensor as_strided(IntList size, IntList stride, int64_t storage_offset=-1) const;
Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset=-1);
-Tensor & sparse_raw_resize_(IntList size, int64_t nDimI, int64_t nDimV);
+Tensor & sparse_resize_(IntList size, int64_t nDimI, int64_t nDimV);
+Tensor & sparse_resize_and_clear_(IntList size, int64_t nDimI, int64_t nDimV);
Tensor & reshape_(IntList size, IntList stride);
Tensor _sparse_mask(SparseTensor mask) const;
Tensor to_dense() const;
4 changes: 3 additions & 1 deletion aten/doc/Type.h
@@ -656,14 +656,16 @@ struct AT_API Type {
virtual Tensor tensor(IntList size) const;
virtual Tensor tensor(IntList size, IntList stride) const;
virtual Tensor tensor() const;
+virtual Tensor sparse_coo_tensor(IntList size) const;
virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const;
virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const;
virtual Tensor alias(const Tensor & self) const;
virtual Tensor & _copy_ignoring_overlaps_(Tensor & self, const Tensor & src) const;
virtual Tensor & as_strided_out(Tensor & result, const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const;
virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const;
virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset=-1) const;
-virtual Tensor & sparse_raw_resize_(Tensor & self, IntList size, int64_t nDimI, int64_t nDimV) const;
+virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t nDimI, int64_t nDimV) const;
+virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t nDimI, int64_t nDimV) const;
virtual Tensor & _cat_out(Tensor & self, TensorList tensors, int64_t dim=0) const;
virtual Tensor _cat(TensorList tensors, int64_t dim=0) const;
virtual Tensor & reshape_(Tensor & self, IntList size, IntList stride) const;
6 changes: 3 additions & 3 deletions aten/src/ATen/ATen.h
@@ -1,6 +1,6 @@
#pragma once

#include "ATen/ATenGeneral.h"
#include "ATen/core/ATenGeneral.h"
#include "ATen/Allocator.h"
#include "ATen/CPUGeneral.h"
#include "ATen/CUDAGuard.h"
@@ -11,8 +11,8 @@
#include "ATen/Dispatch.h"
#include "ATen/Formatting.h"
#include "ATen/Functions.h"
#include "ATen/Generator.h"
#include "ATen/Layout.h"
#include "ATen/core/Generator.h"
#include "ATen/core/Layout.h"
#include "ATen/OptionsGuard.h"
#include "ATen/Scalar.h"
#include "ATen/Storage.h"
1 change: 0 additions & 1 deletion aten/src/ATen/ATenGeneral.cpp

This file was deleted.

14 changes: 0 additions & 14 deletions aten/src/ATen/Allocator.cpp

This file was deleted.

103 changes: 1 addition & 102 deletions aten/src/ATen/Allocator.h
@@ -1,103 +1,2 @@
#pragma once
-
-#include <memory>
-#include <stddef.h>
-
-#include <ATen/Device.h>
-#include <ATen/core/Error.h>
-#include <ATen/core/UniqueVoidPtr.h>
-
-namespace at {
-
-// A DataPtr is a unique pointer (with an attached deleter and some
-// context for the deleter) to some memory, which also records what
-// device is for its data.
-//
-// nullptr DataPtrs can still have a nontrivial device; this allows
-// us to treat zero-size allocations uniformly with non-zero allocations.
-//
-class DataPtr {
-private:
-  detail::UniqueVoidPtr ptr_;
-  Device device_;
-public:
-  // Choice of CPU here is arbitrary; if there's an "undefined" device
-  // we could use that too
-  DataPtr() : ptr_(), device_(DeviceType::CPU) {}
-  DataPtr(void* data, Device device)
-    : ptr_(data), device_(device) {}
-  DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device)
-    : ptr_(data, ctx, ctx_deleter), device_(device) {}
-  void* operator->() const { return ptr_.get(); }
-  void clear() {
-    ptr_.clear();
-  }
-  void* get() const { return ptr_.get(); }
-  void* get_context() const { return ptr_.get_context(); }
-  void* release_context() { return ptr_.release_context(); }
-  operator bool() const { return static_cast<bool>(ptr_); }
-  template <typename T>
-  T* cast_context(DeleterFnPtr expected_deleter) const {
-    return ptr_.cast_context<T>(expected_deleter);
-  }
-  Device device() const { return device_; }
-};
-
-// NB: Device is NOT tested for here; a CUDA nullptr is as much a nullptr as a
-// CPU nullptr
-
-inline bool operator==(const at::DataPtr& dp, std::nullptr_t) noexcept { return !dp; }
-inline bool operator==(std::nullptr_t, const at::DataPtr& dp) noexcept { return !dp; }
-inline bool operator!=(const at::DataPtr& dp, std::nullptr_t) noexcept { return dp; }
-inline bool operator!=(std::nullptr_t, const at::DataPtr& dp) noexcept { return dp; }
-
-// Note [raw_allocate/raw_deallocate and Thrust]
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-// Thrust's support for custom allocators requires us to write something
-// like this:
-//
-//   class ThrustAllocator {
-//     char* allocate(size_t);
-//     void deallocate(char*, size_t);
-//   };
-//
-// This is not good for our unique_ptr based allocator interface, as
-// there is no way to get to the context when we free.
-//
-// However, in some cases the context is exactly the same as
-// the data pointer. In this case, we can support the "raw"
-// allocate and deallocate interface. This is what
-// raw_deleter signifies. By default, it returns a nullptr, which means that
-// the raw interface is not implemented. Be sure to implement it whenever
-// possible, or the raw interface will incorrectly reported as unsupported,
-// when it is actually possible.
-
-struct Allocator {
-  virtual ~Allocator() {}
-  virtual at::DataPtr allocate(size_t n) const = 0;
-
-  // If this returns a non nullptr, it means that allocate()
-  // is guaranteed to return a unique_ptr with this deleter attached;
-  // it means the rawAllocate and rawDeallocate APIs are safe to use.
-  // This function MUST always return the same BoundDeleter.
-  virtual DeleterFnPtr raw_deleter() const { return nullptr; }
-  void* raw_allocate(size_t n) {
-    auto dptr = allocate(n);
-    AT_ASSERT(dptr.get() == dptr.get_context());
-    return dptr.release_context();
-  }
-  void raw_deallocate(void* ptr) {
-    auto d = raw_deleter();
-    AT_ASSERT(d);
-    d(ptr);
-  }
-};
-
-struct AT_API InefficientStdFunctionContext {
-  std::unique_ptr<void, std::function<void(void*)>> ptr_;
-  InefficientStdFunctionContext(std::unique_ptr<void, std::function<void(void*)>>&& ptr)
-    : ptr_(std::move(ptr)) {}
-  static at::DataPtr makeDataPtr(void* ptr, const std::function<void(void*)>& deleter, Device device);
-};
-
-} // namespace at
+#include <ATen/core/Allocator.h>
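For context on what moved: the deleted header (now re-exported from `ATen/core/Allocator.h`) defines `DataPtr`, which bundles a data pointer, a deleter context, and the owning device, and `Allocator`, whose optional `raw_deleter()` unlocks the `raw_allocate`/`raw_deallocate` interface discussed in the Thrust note. A minimal conforming allocator might look like the sketch below; `FreeingCpuAllocator` and `deleteWithFree` are illustrative names invented for this example, not part of the PR:

    #include <cstdlib>
    #include <ATen/core/Allocator.h>

    // A deleter with the DeleterFnPtr signature, void (*)(void*).
    static void deleteWithFree(void* ctx) {
      std::free(ctx);
    }

    struct FreeingCpuAllocator final : at::Allocator {
      at::DataPtr allocate(size_t n) const override {
        void* data = std::malloc(n);
        // The data pointer doubles as the deleter context, so
        // dptr.get() == dptr.get_context() -- the invariant that
        // raw_allocate() asserts before releasing the context.
        return {data, data, &deleteWithFree, at::Device(at::DeviceType::CPU)};
      }
      // Every allocation uses the same deleter, so we can advertise it,
      // which makes raw_allocate()/raw_deallocate() safe to use.
      at::DeleterFnPtr raw_deleter() const override {
        return &deleteWithFree;
      }
    };

With `raw_deleter()` returning non-null, `raw_allocate(n)` hands back a bare `void*` and `raw_deallocate(p)` frees it through `deleteWithFree`, exactly the pair of hooks a Thrust-style `allocate`/`deallocate` adapter needs.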