ROCm
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
Lines changed: 0 additions & 11 deletions
diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt
Lines changed: 2 additions & 2 deletions
diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp
Lines changed: 10 additions & 0 deletions
diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h
Lines changed: 19 additions & 36 deletions
diff --git a/aten/src/ATen/core/WrapDimMinimal.h b/aten/src/ATen/core/WrapDimMinimal.h
Lines changed: 6 additions & 0 deletions
diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp
Lines changed: 11 additions & 0 deletions
diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h
Lines changed: 19 additions & 7 deletions
diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py
Lines changed: 9 additions & 8 deletions
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h
Lines changed: 2 additions & 2 deletions
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
Lines changed: 1 addition & 1 deletion
diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp
Lines changed: 7 additions & 14 deletions
@@ -102,17 +102,6 @@ fi
 # Add the test binaries so that they won't be git clean'ed away
 git add -f build/bin
 
-# Test C FFI plugins
-# cffi install doesn't work for Python 3.7
-if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then
-  # TODO: Don't run this here
-  pip install cffi
-  git clone https://github.com/pytorch/extension-ffi.git
-  pushd extension-ffi/script
-  python build.py
-  popd
-fi
-
 # Test documentation build
 if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
   pushd docs
 
@@ -40,8 +40,8 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
-py2-clang3.8-rocm1.7.1-ubuntu16.04-build
-py2-clang3.8-rocm1.7.1-ubuntu16.04-test
+py2-clang7-rocmdeb-ubuntu16.04-build
+py2-clang7-rocmdeb-ubuntu16.04-test
 pytorch-ppc64le-cuda9.2-cudnn7-py3-build
 pytorch-ppc64le-cuda9.2-cudnn7-py3-test
 pytorch-ppc64le-cuda9.1-cudnn7-py3-build
 
@@ -45,13 +45,20 @@ IntList TensorImpl::sizes() const {
 }
 
 IntList TensorImpl::strides() const {
+  AT_ASSERTM(strides_.size() == sizes_.size(),
+             "Caffe2 tensors don't (yet) have meaningful strides and cannot "
+             "be used in PyTorch.");
   return strides_;
 }
 
 bool TensorImpl::compute_contiguous() const {
   bool is_contiguous = true;
   if (is_empty())
     return is_contiguous;
+  if (strides_.empty()) {
+    // Special case for Caffe2 tensors which don't have strides set.
+    return true;
+  }
   int64_t z = 1;
   for (int64_t d = dim() - 1; d >= 0; d--) {
     if (size(d) != 1) {
@@ -82,6 +89,9 @@ int64_t TensorImpl::size(int64_t d) const {
 }
 
 int64_t TensorImpl::stride(int64_t d) const {
+  AT_ASSERTM(strides_.size() == sizes_.size(),
+             "Caffe2 tensors don't (yet) have meaningful strides and cannot "
+             "be used in PyTorch.");
   d = at::maybe_wrap_dim(d, dim(), false);
   return strides_[d];
 }
 
@@ -10,6 +10,7 @@
 #include "ATen/core/LegacyTypeDispatch.h"
 #include "ATen/core/Backend.h"
 #include "ATen/core/context_base.h"
+#include "ATen/core/WrapDimMinimal.h"
 
 #include "caffe2/core/allocator.h"
 #include "caffe2/core/common.h"
@@ -89,16 +90,6 @@ inline int64_t size_between_dim_(int k, int l, IntList dims) {
   return r;
 }
 
-// Wrap around axis_index if it is negative, s.t., -1 is the last dim
-inline int canonical_axis_index_(int axis_index, int ndims) {
-  CAFFE_ENFORCE_GE(axis_index, -ndims);
-  CAFFE_ENFORCE_LT(axis_index, ndims);
-  if (axis_index < 0) {
-    return axis_index + ndims;
-  }
-  return axis_index;
-}
-
 /**
  * The low-level representation of a tensor, which contains a storage
  * (which contains the actual data) and metadata (e.g., sizes and strides)
@@ -291,13 +282,13 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
   }
 
   virtual void set_size(int64_t dim, int64_t new_size) {
-    sizes_[dim] = new_size;
+    sizes_.at(dim) = new_size;
     refresh_numel();
     refresh_contiguous();
   }
 
   virtual void set_stride(int64_t dim, int64_t new_stride) {
-    strides_[dim] = new_stride;
+    strides_.at(dim) = new_stride;
     refresh_numel();
     refresh_contiguous();
   }
@@ -374,6 +365,10 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     return storage_.device_type();
   }
 
+  at::Device GetDevice() const {
+    return storage_.device();
+  }
+
   /**
    * The static context of a tensor intuitively represents the device
    * type of a tensor; e.g., a CPU tensor is associated with the
@@ -385,18 +380,6 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
     return ::caffe2::get_static_context(device_type());
   }
 
-  /* @brief
-   * Create a context that has the same device_type
-   * as the tensor.
-   * Note that this doesn't support passing in argument
-   * TODO(jerryzh): move this to a global registry
-   * that can create context for us, and then eliminate
-   * this method.
-   */
-  std::unique_ptr<at::BaseContext> CreateContext() const {
-    return GetStaticContext()->CreateContext();
-  }
-
   /**
    * @brief Copies the data from a source tensor, with a context provided to
    * carry out the underlying memcpy operation.  This method respects
@@ -438,8 +421,12 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
         // knows how to copy between CPU and that context
         if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) {
           if (!context) {
-            src.CreateContext()->CopyBytesToDevice(
-                numel() * itemsize(), src.data(), raw_mutable_data(data_type_), device_type());
+            CreateContext(src.GetDevice())
+                ->CopyBytesToDevice(
+                    numel() * itemsize(),
+                    src.data(),
+                    raw_mutable_data(data_type_),
+                    device_type());
           } else {
             CAFFE_ENFORCE(
                 context->device_type() == src.device_type(),
@@ -451,8 +438,11 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
           // In case source context is CPU, and target context is non-CPU
           // We'll have to create a Context from target and perform the
           // copy using that context
-          CreateContext()->CopyBytesFromCPU(
-              numel() * itemsize(), src.data(), raw_mutable_data(data_type_));
+          CreateContext(GetDevice())
+              ->CopyBytesFromCPU(
+                  numel() * itemsize(),
+                  src.data(),
+                  raw_mutable_data(data_type_));
         }
       }
     }
@@ -874,14 +864,7 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target {
   }
 
   inline void update_to_contiguous_strides() {
-    strides_.resize(sizes_.size());
-    if (dim() > 0) {
-      int last_idx = dim() - 1;
-      strides_[last_idx] = 1;
-      for (auto i = last_idx - 1; i >= 0; --i) {
-        strides_[i] = strides_[i + 1] * std::max<int64_t>(sizes_[i + 1], 1);
-      }
-    }
+    strides_.resize(0);
     is_contiguous_ = true;
   }
 
 
@@ -20,4 +20,10 @@ static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wr
   return dim;
 }
 
+// Wrap around axis_index if it is negative, s.t., -1 is the last dim
+// This is the "Caffe2" name
+static inline int canonical_axis_index_(int axis_index, int ndims) {
+  return maybe_wrap_dim(axis_index, ndims, false);
+}
+
 }
@@ -1,5 +1,16 @@
 #include <ATen/core/context_base.h>
 
+namespace at {
+
+C10_DEFINE_TYPED_REGISTRY(
+    ContextRegistry,
+    at::DeviceType,
+    at::BaseContext,
+    std::unique_ptr,
+    at::Device);
+
+} // namespace at
+
 namespace caffe2 {
 
 // TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
 
@@ -6,11 +6,12 @@
 #include <memory>
 #include <unordered_map>
 
-#include <ATen/core/DeviceType.h>
+#include <ATen/core/ATenGeneral.h>
+#include <ATen/core/Device.h>
 #include <ATen/core/Error.h>
 #include <ATen/core/UniqueVoidPtr.h>
 #include <ATen/core/typeid.h>
-#include <ATen/core/ATenGeneral.h>
+#include <c10/util/Registry.h>
 
 namespace caffe2 {
 class Event;
@@ -31,11 +32,6 @@ class CAFFE2_API BaseStaticContext {
 
   virtual std::pair<void*, DeleterFnPtr> New(size_t nbytes) const = 0;
 
-  virtual std::unique_ptr<BaseContext> CreateContext() = 0;
-
-  virtual std::unique_ptr<BaseContext> CreateContext(
-      const caffe2::DeviceOption&) = 0;
-
   virtual DeviceType GetDeviceType() = 0;
 
   /*
@@ -184,6 +180,22 @@ class CAFFE2_API BaseContext {
   }
 };
 
+// Context constructor registry
+C10_DECLARE_TYPED_REGISTRY(
+    ContextRegistry,
+    at::DeviceType,
+    at::BaseContext,
+    std::unique_ptr,
+    at::Device);
+
+#define REGISTER_CONTEXT(type, ...) \
+  C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__)
+
+inline std::unique_ptr<at::BaseContext> CreateContext(
+    const at::Device& device) {
+  return at::ContextRegistry()->Create(device.type(), device);
+}
+
 } // namespace at
 
 namespace caffe2 {
 
@@ -107,16 +107,10 @@ def TypedDict(name, attrs, total=True):  # type: ignore
 # NB: As far as ezyang can tell, we don't *have* to codegen this,
 # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in
 # the superclass.  But it doesn't seem to be harmful.
-#
-# TODO: self_ty is a hack to make things work for native methods which need to
-# take a dtype, but also need to dispatch differently for different types.
-# Eliminate it at some point.
 TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\
 ${return_type} ${Type}::${api_name}(${type_method_formals}) const {
     ${device_guard_declaration}
-    const auto& self_ty = *this;
-    (void)self_ty;
-    ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals});
+    ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${type_derived_call_actuals});
 }
 """)
 TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\
@@ -1574,8 +1568,15 @@ def process_native(option):
                         TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env))
                 else:
                     option['native_type_method_dispatch'] = native_dispatch
+                    type_derived_call_actuals = []
+                    for actual, arg in zip(option['actuals'], option['arguments']):
+                        if arg.get('is_type_dispatched', False):
+                            type_derived_call_actuals.append('*this')
+                        else:
+                            type_derived_call_actuals.append(actual)
                     type_object_definitions.append(
-                        TYPE_DERIVED_DEFINITION_NATIVE.substitute(env))
+                        TYPE_DERIVED_DEFINITION_NATIVE.substitute(
+                            env, type_derived_call_actuals=type_derived_call_actuals))
 
     for declaration in declarations:
         for option in declaration['options']:
 
@@ -2068,8 +2068,8 @@
     SparseCPU: hspmm_sparse_cpu
     SparseCUDA: hspmm_sparse_cuda
 
-# This "raw copy" doesn't handle conversions NOR does it handle non-blocking.
-- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor
+- func: copy_sparse_to_sparse_(Tensor self, Tensor src, bool non_blocking=false) -> Tensor
+  variants: function
   dispatch:
     SparseCPU: copy_sparse_
     SparseCUDA: copy_sparse_
 
@@ -204,7 +204,7 @@ SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Te
 
 SparseTensor clone_sparse(const SparseTensor& self) {
   SparseTensor other = new_with_dims_and_size_sparse(self.type(), self._sparseDims(), self._denseDims(), self.sizes());
-  _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values());
+  _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values(), true);
   _get_sparse_impl(other)->set_coalesced(self.is_coalesced());
   return other;
 }
@@ -243,11 +243,11 @@ Tensor sparse_to_dense(const SparseTensor& self) {
   return dst.add_(self);
 }
 
-SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src) {
+SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src, bool non_blocking) {
   if (isSameTensor(self, src)) return self;
   _get_sparse_impl(self)->resize_(src._sparseDims(), src._denseDims(), src.sizes());
   // NB: This seems to copy the underlying full indices/values buffer
-  _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values());
+  _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values(), non_blocking);
   _get_sparse_impl(self)->set_coalesced(src.is_coalesced());
   return self;
 }
 
@@ -98,7 +98,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) {
       r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!");
   }
   else {
-    r = raw_copy_sparse_(r, t.coalesce());
+    copy_sparse_to_sparse_(r, t.coalesce());
   }
   r._values().log1p_();
   return r;
@@ -192,7 +192,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S
   AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes());
 
   if (src._nnz() == 0) {
-    return raw_copy_sparse_(r, t);
+    return copy_sparse_to_sparse_(r, t);
   }
   if (t._nnz() == 0) {
     return mul_out_sparse_scalar(r, src, value);
 
@@ -50,8 +50,8 @@ inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indic
 
 // Takes indices and values and makes a (data) copy of them to put into the sparse
 // indices/values.  This used to be called THSTensor_(_set)
-inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) {
-  _alias_into_sparse(self, indices.clone(), values.clone());
+inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) {
+  _alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking));
 }
 
 // Does NOT make copies of indices/values
 
@@ -348,7 +348,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const
   AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes());
 
   if (src._nnz() == 0) {
-    return raw_copy_sparse_(r_, t);
+    return copy_sparse_to_sparse_(r_, t);
   }
   if (t._nnz() == 0) {
     return mul_out_sparse_scalar(r_, src, value);
 
@@ -18,7 +18,8 @@ namespace at {
 
 Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking) const {
   Tensor b_src;
-  std::tie(b_src) = expand_inplace(self, src, "copy");
+  if (is_sparse()) b_src = src;
+  else std::tie(b_src) = expand_inplace(self, src, "copy");
   return s_copy_(self, b_src, non_blocking);
 }
 
@@ -28,19 +29,11 @@ Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional<Device>
     device_guard.set_index(to_device.value().index());
   }
   AT_CHECK(src.defined(), "attempt to copy an undefined tensor");
-  if (is_sparse()) {
-    auto indices = src._indices();
-    auto values = src._values();
-    auto & this_dense = toBackend(is_cuda() ? Backend::CUDA : Backend::CPU);
-    auto & this_dense_idx = this_dense.toScalarType(ScalarType::Long);
-    auto indices_copy = this_dense_idx.copy(indices, non_blocking);
-    auto values_copy = this_dense.copy(values, non_blocking);
-    return _sparse_coo_tensor_unsafe(indices_copy, values_copy, src.sizes());
-  } else {
-    Tensor r = this->tensor(src.sizes());
-    r.copy_(src, non_blocking);
-    return r;
-  }
+  Tensor r;
+  if (is_sparse()) r = this->native_tensor();
+  else r = this->tensor(src.sizes());
+  r.copy_(src, non_blocking);
+  return r;
 }
 
 void TypeDefault::backward(Tensor & self, at::optional<Tensor> gradient, bool keep_graph, bool create_graph) const {
Original file line number	Diff line number	Diff line change
`@@ -20,4 +20,10 @@ static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wr`
`20`	`20`	`return dim;`
`21`	`21`	`}`
`22`	`22`
	`23`	`+// Wrap around axis_index if it is negative, s.t., -1 is the last dim`
	`24`	`+// This is the "Caffe2" name`
	`25`	`+static inline int canonical_axis_index_(int axis_index, int ndims) {`
	`26`	`+ return maybe_wrap_dim(axis_index, ndims, false);`
	`27`	`+}`
	`28`	`+`
`23`	`29`	`}`
Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) {`
`98`	`98`	`r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!");`
`99`	`99`	`}`
`100`	`100`	`else {`
`101`		`- r = raw_copy_sparse_(r, t.coalesce());`
	`101`	`+ copy_sparse_to_sparse_(r, t.coalesce());`
`102`	`102`	`}`
`103`	`103`	`r._values().log1p_();`
`104`	`104`	`return r;`
`@@ -192,7 +192,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S`
`192`	`192`	`AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes());`
`193`	`193`
`194`	`194`	`if (src._nnz() == 0) {`
`195`		`- return raw_copy_sparse_(r, t);`
	`195`	`+ return copy_sparse_to_sparse_(r, t);`
`196`	`196`	`}`
`197`	`197`	`if (t._nnz() == 0) {`
`198`	`198`	`return mul_out_sparse_scalar(r, src, value);`
Original file line number	Diff line number	Diff line change
`@@ -50,8 +50,8 @@ inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indic`
`50`	`50`
`51`	`51`	`// Take indices and values and makes a (data) copy of them to put into the sparse`
`52`	`52`	`// indices/values. This used to be called THSTensor_(_set)`
`53`		`-inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) {`
`54`		`- _alias_into_sparse(self, indices.clone(), values.clone());`
	`53`	`+inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) {`
	`54`	`+ _alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking));`
`55`	`55`	`}`
`56`	`56`
`57`	`57`	`// Does NOT make copies of indices/values`
Original file line number	Diff line number	Diff line change
`@@ -348,7 +348,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const`
`348`	`348`	`AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes());`
`349`	`349`
`350`	`350`	`if (src._nnz() == 0) {`
`351`		`- return raw_copy_sparse_(r_, t);`
	`351`	`+ return copy_sparse_to_sparse_(r_, t);`
`352`	`352`	`}`
`353`	`353`	`if (t._nnz() == 0) {`
`354`	`354`	`return mul_out_sparse_scalar(r_, src, value);`