
Commit 8476972

Merge remote-tracking branch 'upstream/master' into ifu

2 parents: cf78d09 + 6d6655e


47 files changed: +746 −460 lines

aten/src/ATen/CPUTypeDefault.cpp

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+#include <ATen/CPUTypeDefault.h>
+
+#include <ATen/Context.h>
+#include <ATen/CPUGenerator.h>
+
+namespace at {
+
+Allocator* CPUTypeDefault::allocator() const {
+  return getCPUAllocator();
+}
+
+Device CPUTypeDefault::getDeviceFromPtr(void * data) const {
+  return DeviceType::CPU;
+}
+
+std::unique_ptr<Generator> CPUTypeDefault::generator() const {
+  return std::unique_ptr<Generator>(new CPUGenerator(&at::globalContext()));
+}
+
+} // namespace at
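For context on the new hooks, a minimal usage sketch (not part of the diff): it assumes the existing at::CPU accessor from Context.h and the raw_allocate/raw_deallocate helpers on at::Allocator.

#include <ATen/ATen.h>

int main() {
  at::Type& cpu = at::CPU(at::kFloat);
  // Routed through CPUTypeDefault::allocator() -> getCPUAllocator().
  at::Allocator* alloc = cpu.allocator();
  void* buf = alloc->raw_allocate(16 * sizeof(float));
  // CPUTypeDefault::getDeviceFromPtr() ignores the pointer: CPU memory is CPU.
  at::Device dev = cpu.getDeviceFromPtr(buf);
  AT_ASSERT(dev.type() == at::DeviceType::CPU);
  alloc->raw_deallocate(buf);
  return 0;
}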

aten/src/ATen/CPUTypeDefault.h

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+#pragma once
+#include <ATen/TypeDefault.h>
+
+namespace at {
+
+struct AT_API CPUTypeDefault : public TypeDefault {
+  CPUTypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined)
+      : TypeDefault(type_id, is_variable, is_undefined) {}
+  Allocator* allocator() const override;
+  Device getDeviceFromPtr(void * data) const override;
+  std::unique_ptr<Generator> generator() const override;
+};
+
+} // namespace at

aten/src/ATen/Context.cpp

Lines changed: 4 additions & 0 deletions

@@ -118,4 +118,8 @@ Type& getMaybeVariableType(const TensorImpl* impl) {
       backend, impl->scalar_type(), impl->is_variable());
 }
 
+Allocator* getCPUAllocator() {
+  return getTHDefaultAllocator();
+}
+
 }

aten/src/ATen/Context.h

Lines changed: 2 additions & 0 deletions

@@ -158,6 +158,8 @@ static inline Type& getNonVariableType(DeviceType p, ScalarType s) {
 AT_API Type& getMaybeVariableType(TensorOptions options);
 AT_API Type& getMaybeVariableType(const TensorImpl*);
 
+AT_API Allocator* getCPUAllocator();
+
 static inline Type& CPU(ScalarType s) {
   return getNonVariableType(Backend::CPU, s);
 }

aten/src/ATen/UndefinedType.cpp

Lines changed: 10 additions & 8 deletions

@@ -11,9 +11,14 @@ ScalarType UndefinedType::scalarType() const {
 Backend UndefinedType::backend() const {
   return Backend::Undefined;
 }
-bool UndefinedType::is_cuda() const { return false; }
-bool UndefinedType::is_sparse() const { return false; }
-bool UndefinedType::is_distributed() const { return false; }
+
+Allocator* UndefinedType::allocator() const {
+  AT_ERROR("allocator not defined for UndefinedType");
+}
+
+Device UndefinedType::getDeviceFromPtr(void*) const {
+  AT_ERROR("getDeviceFromPtr not defined for UndefinedType");
+}
 
 Storage UndefinedType::storage(bool resizable) const {
   AT_ERROR("storage not defined for UndefinedType");
@@ -38,8 +43,9 @@ std::unique_ptr<Generator> UndefinedType::generator() const {
 }
 
 const char * UndefinedType::toString() const {
-  return UndefinedType::typeString();
+  return "UndefinedType";
 }
+
 TypeID UndefinedType::ID() const {
   return TypeID::Undefined;
 }
@@ -61,10 +67,6 @@ Type & UndefinedType::toScalarType(ScalarType s) const {
   AT_ERROR("toScalarType not implemented for UndefinedType to non-UndefinedType");
 }
 
-const char * UndefinedType::typeString() {
-  return "UndefinedType";
-}
-
 Tensor & UndefinedType::s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const {
   AT_ERROR("s_copy not defined for UndefinedType");
 }

aten/src/ATen/UndefinedType.h

Lines changed: 2 additions & 4 deletions

@@ -15,9 +15,8 @@ struct UndefinedType final : public TypeDefault {
   explicit UndefinedType();
   virtual ScalarType scalarType() const override;
   virtual Backend backend() const override;
-  virtual bool is_cuda() const override;
-  virtual bool is_sparse() const override;
-  virtual bool is_distributed() const override;
+  virtual Allocator* allocator() const override;
+  virtual Device getDeviceFromPtr(void* data) const override;
   virtual Storage storage(bool resizable = false) const override;
   virtual Storage storage(size_t size, bool resizable = false) const override;
   virtual Storage storageFromBlob(void * data, int64_t size, const std::function<void(void*)> & deleter) const override;
@@ -28,7 +27,6 @@ struct UndefinedType final : public TypeDefault {
   virtual Type & toBackend(Backend b) const override;
   virtual Type & toScalarType(ScalarType s) const override;
   virtual TypeID ID() const override;
-  static const char * typeString();
   virtual Storage unsafeStorageFromTH(void * th_pointer, bool retain) const override;
   virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override;

aten/src/ATen/cuda/CUDAContext.cpp

Lines changed: 6 additions & 2 deletions

@@ -1,5 +1,5 @@
 #include "ATen/cuda/CUDAContext.h"
-#include "THC/THCGeneral.h"
+#include "THC/THCGeneral.hpp"
 
 namespace at { namespace cuda {
 
@@ -45,6 +45,10 @@ void uncheckedSetCurrentCUDAStream(CUDAStream stream) {
   detail::CUDAStream_uncheckedSetStream(stream.internals());
 }
 
+Allocator* getCUDADeviceAllocator() {
+  return at::globalContext().getTHCState()->cudaDeviceAllocator;
+}
+
 /* Handles */
 #ifndef __HIP_PLATFORM_HCC__
 cusparseHandle_t getCurrentCUDASparseHandle() {
@@ -54,4 +58,4 @@ void uncheckedSetCurrentCUDAStream(CUDAStream stream) {
 
 } // namespace cuda
 
-} // namespace at
+} // namespace at

aten/src/ATen/cuda/CUDAContext.h

Lines changed: 2 additions & 0 deletions

@@ -54,6 +54,8 @@ AT_API CUDAStream getCurrentCUDAStream(int64_t device = -1);
 AT_API void setCurrentCUDAStream(CUDAStream stream);
 AT_API void uncheckedSetCurrentCUDAStream(CUDAStream stream);
 
+AT_API Allocator* getCUDADeviceAllocator();
+
 /* Handles */
 #ifndef __HIP_PLATFORM_HCC__
 AT_API cusparseHandle_t getCurrentCUDASparseHandle();

aten/src/ATen/cuda/CUDADevice.h

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ATen/cuda/Exceptions.h"
+
+#include "cuda.h"
+
+namespace at {
+namespace cuda {
+
+inline Device getDeviceFromPtr(void* ptr) {
+  struct cudaPointerAttributes attr;
+  AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
+  return {DeviceType::CUDA, attr.device};
+}
+
+}} // namespace at::cuda
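A small sketch of the new helper in use (not part of the diff; assumes a CUDA build and the standard CUDA runtime calls cudaMalloc/cudaFree):

#include <ATen/ATen.h>
#include <ATen/cuda/CUDADevice.h>
#include <cuda_runtime.h>

int main() {
  void* p = nullptr;
  AT_CUDA_CHECK(cudaMalloc(&p, 256));  // allocates on the current device
  // cudaPointerGetAttributes recovers the owning device from the pointer.
  at::Device d = at::cuda::getDeviceFromPtr(p);
  AT_ASSERT(d.type() == at::DeviceType::CUDA);
  AT_CUDA_CHECK(cudaFree(p));
  return 0;
}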
aten/src/ATen/cuda/CUDATypeDefault.cpp

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+#include <ATen/cuda/CUDATypeDefault.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDADevice.h>
+#include <ATen/CUDAGenerator.h>
+
+namespace at {
+
+Allocator* CUDATypeDefault::allocator() const {
+  return cuda::getCUDADeviceAllocator();
+}
+Device CUDATypeDefault::getDeviceFromPtr(void * data) const {
+  return cuda::getDeviceFromPtr(data);
+}
+std::unique_ptr<Generator> CUDATypeDefault::generator() const {
+  return std::unique_ptr<Generator>(new CUDAGenerator(&at::globalContext()));
+}
+
+} // namespace at

aten/src/ATen/cuda/CUDATypeDefault.h

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+#pragma once
+#include <ATen/TypeDefault.h>
+#include <ATen/cuda/ATenCUDAGeneral.h>
+
+namespace at {
+
+struct AT_CUDA_API CUDATypeDefault : public TypeDefault {
+  CUDATypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined)
+      : TypeDefault(type_id, is_variable, is_undefined) {}
+
+  Allocator* allocator() const override;
+  Device getDeviceFromPtr(void * data) const override;
+  std::unique_ptr<Generator> generator() const override;
+};
+
+} // namespace at

aten/src/ATen/cudnn/Descriptors.h

Lines changed: 1 addition & 1 deletion

@@ -257,7 +257,7 @@ struct AT_CUDA_API DropoutDescriptor
     AT_CUDNN_CHECK(cudnnDropoutGetStatesSize(handle, &state_size));
     AT_ASSERT(type.is_cuda());
     AT_ASSERT(type.scalarType() == kByte);
-    state = at::empty({static_cast<int64_t>(state_size)}, type);
+    state = at::empty({static_cast<int64_t>(state_size)}, type.options());
     AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed));
   }
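The one-line change above migrates this call site from the Type& overload of at::empty to the TensorOptions overload; a minimal sketch of the new pattern, assuming only what the diff shows (type.options() yielding options accepted by at::empty):

// Hypothetical helper mirroring the call site above.
at::Tensor make_dropout_state(const at::Type& type, size_t state_size) {
  // Build the state tensor through TensorOptions derived from the Type.
  return at::empty({static_cast<int64_t>(state_size)}, type.options());
}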

aten/src/ATen/gen.py

Lines changed: 2 additions & 0 deletions

@@ -256,6 +256,8 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations
         ]
         env['extra_cuda_headers'] = ['#include <ATen/cuda/CUDAHalf.cuh>']
         env['extra_cuda_headers'].append('#include <ATen/DeviceGuard.h>')
+        env['extra_cuda_headers'].append('#include <ATen/cuda/CUDADevice.h>')
+        env['extra_cuda_headers'].append('#include <ATen/cuda/CUDATypeDefault.h>')
         sname = '' if scalar_name == "Float" else scalar_name
         env['THType'] = 'Cuda{}'.format(sname)
         env['THStorage'] = 'THCuda{}Storage'.format(sname)

aten/src/ATen/native/Linear.cpp

Lines changed: 58 additions & 0 deletions

@@ -457,4 +457,62 @@ Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight
   return output;
 }
 
+// implements tensordot, a matrix-multiplication-like contraction over the
+// dimensions given in the two dimension lists
+Tensor tensordot(const Tensor& input1, const Tensor& input2, IntList dims1, IntList dims2) {
+  AT_CHECK(dims1.size() == dims2.size(), "both dimension lists should have same length");
+  int64_t csize = 1;  // total size of the contracted dimensions
+  Tensor t1 = input1;
+  Tensor t2 = input2;
+  for (size_t i = 0; i < dims1.size(); i++) {
+    int s1 = input1.size(dims1[i]);
+    int s2 = input2.size(dims2[i]);
+    if (s2 == 1) {  // broadcasted dimensions can be summed right away
+      t1 = t1.sum(dims1[i], true);
+    } else if (s1 == 1) {
+      t2 = t2.sum(dims2[i], true);
+    } else {
+      AT_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i],
+               " and second has size ", s2, " in dim ", dims2[i]);
+      csize *= s1;
+    }
+  }
+
+  auto cdims1 = dim_list_to_bitset(dims1, input1.dim());
+  auto cdims2 = dim_list_to_bitset(dims2, input2.dim());
+  std::vector<int64_t> p1, p2, rsizes;  // p1, p2: input permutations, rsizes: sizes of the result
+  p1.reserve(input1.dim());
+  p2.reserve(input2.dim());
+  rsizes.reserve(input1.dim() + input2.dim() - (int64_t) dims1.size());
+  int64_t size1 = 1;  // number of non-contracted elements in input1
+  int64_t size2 = 1;  // number of non-contracted elements in input2
+
+  // fill the permutations and compute sizes
+  for (int64_t i = 0; i < input1.dim(); i++) {
+    if (!cdims1[i]) {
+      p1.emplace_back(i);
+      size1 *= t1.size(i);
+      rsizes.emplace_back(t1.size(i));
+    }
+  }
+  for (size_t i = 0; i < dims1.size(); i++) {
+    p1.emplace_back(dims1[i]);
+  }
+  for (size_t i = 0; i < dims2.size(); i++) {
+    p2.emplace_back(dims2[i]);
+  }
+  for (int64_t i = 0; i < input2.dim(); i++) {
+    if (!cdims2[i]) {
+      p2.emplace_back(i);
+      size2 *= t2.size(i);
+      rsizes.emplace_back(t2.size(i));
+    }
+  }
+  // permute and reshape for matrix multiplication
+  t1 = t1.permute(p1).reshape({size1, csize});
+  t2 = t2.permute(p2).reshape({csize, size2});
+  // multiply and reshape to target size
+  return at::mm(t1, t2).reshape(rsizes);
+}
+
 }} // namespace at::native
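To make the contraction concrete, here is a usage sketch; exposing the function as at::tensordot is an assumption (it would come through the generated bindings, not this file), and the at::mm reference follows the permute/reshape scheme in the code:

#include <ATen/ATen.h>

int main() {
  // a: 3x4x5, b: 4x3x2; contract a's dims {1, 0} against b's dims {0, 1}.
  at::Tensor a = at::randn({3, 4, 5});
  at::Tensor b = at::randn({4, 3, 2});
  at::Tensor c = at::tensordot(a, b, {1, 0}, {0, 1});  // result shape: 5x2

  // Reference: move the non-contracted dim of a to the front, flatten both
  // operands to matrices of shape {5, 12} and {12, 2}, then one at::mm.
  at::Tensor ref = at::mm(a.permute({2, 1, 0}).reshape({5, 12}),
                          b.reshape({12, 2}));
  AT_ASSERT(at::allclose(c, ref));
  return 0;
}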
