ROCm · iotamudelta · Jul 3, 2018 · Jul 2, 2018 · Jul 2, 2018 · Jul 2, 2018
diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h
@@ -67,7 +67,7 @@ class THLongStorageView {
     storage.scalar_type = at::CTypeToScalarType<th::from_type<int64_t>>::to();
     storage.refcount = 0;
     storage.flag = 0;
-    storage.allocator = nullptr;
+    storage.allocatorVoidPtr = nullptr;
     storage.allocatorContext = nullptr;
   }
 private:

diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp b/aten/src/ATen/cuda/PinnedMemoryAllocator.cpp
@@ -3,6 +3,7 @@
 #include <ATen/Config.h>
 
 #include <THC/THC.h>
+#include <THC/THCGeneral.hpp>
 
 #include <stdexcept>
 

diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -10,6 +10,7 @@
 #include <ATen/detail/CUDAHooksInterface.h>
 
 #include "THC/THC.h"
+#include <THC/THCGeneral.hpp>
 
 #if AT_CUDNN_ENABLED()
 #include "ATen/cudnn/cudnn-wrapper.h"

diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp
@@ -13,7 +13,7 @@ namespace at {
 ${Storage}::${Storage}(Context* context):
     storage(${THStorage}_new(${state})), context(context) {}
 
-${Storage}::${Storage}(Context* context, ${THStorage}* storage):
+${Storage}::${Storage}(Context* context, THStorage* storage):
     storage(storage), context(context) {}
 
 ${Storage}::${Storage}(Context* context, size_t storage_size)

diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h
@@ -16,7 +16,7 @@ struct Allocator;
 struct ${Storage} final : public Storage {
 public:
   explicit ${Storage}(Context* context);
-  ${Storage}(Context* context, ${THStorage} *wrapped);
+  ${Storage}(Context* context, THStorage *wrapped);
   ${Storage}(Context* context, size_t size);
   ${Storage}(Context* context, size_t size, Allocator* allocator);
   ${Storage}(Context* context,
@@ -50,7 +50,7 @@ struct ${Storage} final : public Storage {
 
 protected:
   friend struct ${Type};
-  ${THStorage} *storage;
+  THStorage *storage;
   Context* context;
 };
 

diff --git a/aten/src/TH/THStorage.cpp b/aten/src/TH/THStorage.cpp
@@ -1,3 +1,5 @@
+#include <climits>
+
 #include "THStorage.hpp"
 
 #include "generic/THStorage.cpp"
@@ -13,6 +15,8 @@
 #include "THGenerateHalfType.h"
 
 void THStorage_free(THStorage *storage) {
   if(!storage)
     return;
+  // Freeing a null storage is a no-op, so only inspect the backend once storage is known to be non-null.
+  AT_ASSERT(storage->backend == at::kCPU);
 
@@ -21,7 +25,7 @@ void THStorage_free(THStorage *storage) {
     if(--storage->refcount == 0)
     {
       if(storage->flag & TH_STORAGE_FREEMEM) {
-        storage->allocator->free(storage->allocatorContext, storage->data_ptr);
+        static_cast<THAllocator*>(storage->allocatorVoidPtr)->free(storage->allocatorContext, storage->data_ptr);
       }
       if(storage->flag & TH_STORAGE_VIEW) {
         THStorage_free(storage->view);
@@ -65,3 +69,38 @@ THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElemen
   }
   return copy;
 }
+
+// Creates an empty (size 0) CPU storage of the given scalar type.
+THStorage* THStorage_new(at::ScalarType scalar_type)
+{
+  return THStorage_newWithSize(scalar_type, 0);
+}
+
+// Creates a CPU storage holding `size` elements, backed by THDefaultAllocator.
+THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size)
+{
+  return THStorage_newWithAllocator(scalar_type, size, &THDefaultAllocator, nullptr);
+}
+
+// Creates a CPU storage of `size` elements of `scalar_type`, obtaining the
+// data buffer from `allocator` (called with `allocatorContext`). The result
+// starts with refcount 1 and is refcounted, resizable, and frees its data.
+THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size,
+                                      THAllocator *allocator,
+                                      void *allocatorContext)
+{
+  THStorage *storage = static_cast<THStorage*>(THAlloc(sizeof(THStorage)));
+  storage->backend = at::kCPU;
+  storage->scalar_type = scalar_type;
+  storage->data_ptr = allocator->malloc(allocatorContext, at::elementSize(scalar_type)*size);
+  storage->size = size;
+  // No constructor has run on the THAlloc'd block, so build the atomic in place.
+  new (&storage->refcount) std::atomic<int>(1);
+  storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
+  storage->allocatorVoidPtr = allocator;
+  storage->allocatorContext = allocatorContext;
+  // Not a view; give the field a defined value because THStorage_(swap) exchanges it.
+  storage->view = nullptr;
+  storage->device = INT_MIN;  // device is not meaningful on CPU
+  return storage;
+}
diff --git a/aten/src/TH/THStorage.hpp b/aten/src/TH/THStorage.hpp
@@ -5,21 +5,23 @@
 
 #include "THStorage.h"
 
-#include "ATen/ScalarType.h"
-#include "ATen/ScalarTypeUtils.h"
+#include <ATen/ScalarType.h>
+#include <ATen/ScalarTypeUtils.h>
 #include "THTypeConversion.hpp"
 #include <atomic>
 
 typedef struct THStorage
 {
+    at::Backend backend; // kCPU or kCUDA only
     at::ScalarType scalar_type;
     void *data_ptr;
     ptrdiff_t size;
     std::atomic<int> refcount;
     char flag;
-    THAllocator *allocator;
+    void *allocatorVoidPtr; // Either THAllocator* or THCDeviceAllocator*
     void *allocatorContext;
     struct THStorage *view;
+    int device;
 
     template <typename T>
     inline T * data() const {
@@ -36,3 +38,9 @@ typedef struct THStorage
       return static_cast<T*>(this->data_ptr);
     }
 } THStorage;
+
+TH_API THStorage* THStorage_new(at::ScalarType scalar_type);
+TH_API THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size);
+TH_API THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size,
+                                             THAllocator *allocator,
+                                             void *allocatorContext);
diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp
@@ -21,29 +21,22 @@ size_t THStorage_(elementSize)()
 
 THStorage* THStorage_(new)(void)
 {
-  return THStorage_(newWithSize)(0);
+  return THStorage_new(at::CTypeToScalarType<th::from_type<real>>::to());
 }
 
 THStorage* THStorage_(newWithSize)(ptrdiff_t size)
 {
-  return THStorage_(newWithAllocator)(size, &THDefaultAllocator, NULL);
+  return THStorage_newWithSize(at::CTypeToScalarType<th::from_type<real>>::to(), size);
 }
 
 THStorage* THStorage_(newWithAllocator)(ptrdiff_t size,
                                         THAllocator *allocator,
                                         void *allocatorContext)
 {
-  THStorage *storage = static_cast<THStorage*>(THAlloc(sizeof(THStorage)));
-  storage->scalar_type = at::CTypeToScalarType<th::from_type<real>>::to();
-  storage->data_ptr = allocator->malloc(allocatorContext, sizeof(real)*size);
-  storage->size = size;
-  new (&storage->refcount) std::atomic<int>(1);
-  storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
-  storage->allocator = allocator;
-  storage->allocatorContext = allocatorContext;
-  return storage;
+  return THStorage_newWithAllocator(at::CTypeToScalarType<th::from_type<real>>::to(), size, allocator, allocatorContext);
 }
 
+
 THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags)
 {
   THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags);
@@ -142,28 +135,34 @@ THStorage* THStorage_(newWithDataAndAllocator)(real* data, ptrdiff_t size,
                                                THAllocator* allocator,
                                                void* allocatorContext) {
   THStorage *storage = static_cast<THStorage*>(THAlloc(sizeof(THStorage)));
+  storage->backend = at::kCPU;
   storage->scalar_type = at::CTypeToScalarType<th::from_type<real>>::to();
   storage->data_ptr = data;
   storage->size = size;
-  storage->refcount = 1;
+  new (&storage->refcount) std::atomic<int>(1);  // construct in place, as THStorage_newWithAllocator does
   storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
-  storage->allocator = allocator;
+  storage->allocatorVoidPtr = allocator;
   storage->allocatorContext = allocatorContext;
+  storage->device = INT_MIN;  // device is not meaningful on CPU (same sentinel as THStorage_newWithAllocator)
   return storage;
 }
 
 void THStorage_(resize)(THStorage *storage, ptrdiff_t size)
 {
+  AT_ASSERT(storage->backend == at::kCPU);
+
+  auto* th_allocator = static_cast<THAllocator*>(storage->allocatorVoidPtr);
+
   if(storage->flag & TH_STORAGE_RESIZABLE)
   {
-    if(storage->allocator->realloc == NULL) {
+    if(th_allocator->realloc == NULL) {
       /* case when the allocator does not have a realloc defined */
       real *old_data = THStorage_(data)(storage);
       ptrdiff_t old_size = storage->size;
       if (size == 0) {
         storage->data_ptr = NULL;
       } else {
-        storage->data_ptr = storage->allocator->malloc(
+        storage->data_ptr = th_allocator->malloc(
             storage->allocatorContext,
             sizeof(real)*size);
       }
@@ -176,10 +175,10 @@ void THStorage_(resize)(THStorage *storage, ptrdiff_t size)
         if (copy_size > 0) {
           memcpy(THStorage_(data)(storage), old_data, sizeof(real)*copy_size);
         }
-        storage->allocator->free(storage->allocatorContext, old_data);
+        th_allocator->free(storage->allocatorContext, old_data);
       }
     } else {
-      storage->data_ptr = storage->allocator->realloc(
+      storage->data_ptr = th_allocator->realloc(
               storage->allocatorContext,
               THStorage_(data)(storage),
               sizeof(real)*size);
@@ -215,17 +214,19 @@ void THStorage_(swap)(THStorage *storage1, THStorage *storage2)
     void *data_ptr;
     ptrdiff_t size;
     char flag;
-    THAllocator *allocator;
+    void *allocatorVoidPtr;
     void *allocatorContext;
     struct THStorage *view;
+    int device;
 
     SWAP(data_ptr);
     SWAP(size);
     SWAP(flag);
     // don't swap refcount!
-    SWAP(allocator);
+    SWAP(allocatorVoidPtr);
     SWAP(allocatorContext);
     SWAP(view);
+    SWAP(device);
 #undef SWAP
 }
 

diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp
@@ -6,6 +6,7 @@
 #include "THCStream.h"
 #include "THCThreadLocal.h"
 #include "THCTensorRandom.h"
+#include "THCGeneral.hpp"
 #include <stdlib.h>
 #include <stdint.h>
 

diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in
@@ -47,6 +47,7 @@
 struct THCRNGState;  /* Random number generator state. */
 typedef struct THCStream THCStream;
 typedef struct THCState THCState;
+struct THCState;
 
 typedef struct _THCDeviceAllocator {
    cudaError_t (*malloc)( void*, void**, size_t,         cudaStream_t);
@@ -70,54 +71,6 @@ typedef struct _THCCudaResourcesPerDevice {
   size_t scratchSpacePerStream;
 } THCCudaResourcesPerDevice;
 
-
-/* Global state to be held in the cutorch table. */
-struct THCState {
-  struct THCRNGState* rngState;
-  struct cudaDeviceProp* deviceProperties;
-  /* Set of all allocated resources. blasHandles and sparseHandles do not have
-     a default and must be explicitly initialized. We always initialize 1
-     blasHandle and 1 sparseHandle but we can use more.
-  */
-  THCCudaResourcesPerDevice* resourcesPerDevice;
-  /* Captured number of devices upon startup; convenience for bounds checking */
-  int numDevices;
-  int numUserBlasHandles;
-  int numUserSparseHandles;
-
-  /* Allocator using cudaMallocHost. */
-  THAllocator* cudaHostAllocator;
-  THAllocator* cudaUVAAllocator;
-  THCDeviceAllocator* cudaDeviceAllocator;
-
-  /* Index of the current selected BLAS handle. The actual BLAS handle used
-     depends on the current device. */
-  THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
-  /* Index of the current selected sparse handle. The actual sparse handle used
-     depends on the current device. */
-  THCThreadLocal/*<int>*/ currentPerDeviceSparseHandle;
-  /* Array of thread locals containing the current stream for each device */
-  THCThreadLocal* currentStreams;
-
-  /* Table of enabled peer-to-peer access between directed pairs of GPUs.
-     If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
-  int** p2pAccessEnabled;
-
-  /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU
-     copies are allowed via p2p if p2p access is enabled at all for
-     the pair of GPUs in question, but if this flag is true, then
-     all cross-GPU access checks are disabled, allowing kernels to
-     directly access memory on another GPUs.
-     Note that p2p access must exist and be enabled for the pair of
-     GPUs in question. */
-  int p2pKernelAccessEnabled;
-
-  void (*cutorchGCFunction)(void *data);
-  void *cutorchGCData;
-  ptrdiff_t heapSoftmax;
-  ptrdiff_t heapDelta;
-};
-
 THC_API THCState* THCState_alloc(void);
 THC_API void THCState_free(THCState* state);
 

diff --git a/aten/src/THC/THCGeneral.hpp b/aten/src/THC/THCGeneral.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "THCGeneral.h"
+
+/* Global state of THC. Full definition; THCGeneral.h only forward-declares THCState. */
+struct THCState {
+  struct THCRNGState* rngState;
+  struct cudaDeviceProp* deviceProperties;
+  /* Set of all allocated resources. blasHandles and sparseHandles do not have
+     a default and must be explicitly initialized. We always initialize 1
+     blasHandle and 1 sparseHandle but we can use more.
+  */
+  THCCudaResourcesPerDevice* resourcesPerDevice;
+  /* Captured number of devices upon startup; convenience for bounds checking */
+  int numDevices;
+  int numUserBlasHandles;
+  int numUserSparseHandles;
+
+  /* Allocator using cudaMallocHost. */
+  THAllocator* cudaHostAllocator;
+  THAllocator* cudaUVAAllocator;
+  THCDeviceAllocator* cudaDeviceAllocator;
+
+  /* Index of the currently selected BLAS handle. The actual BLAS handle used
+     depends on the current device. */
+  THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
+  /* Index of the currently selected sparse handle. The actual sparse handle used
+     depends on the current device. */
+  THCThreadLocal/*<int>*/ currentPerDeviceSparseHandle;
+  /* Array of thread locals containing the current stream for each device */
+  THCThreadLocal* currentStreams;
+
+  /* Table of enabled peer-to-peer access between directed pairs of GPUs.
+     If i accessing allocs on j is enabled, p2pAccessEnabled[i][j] is 1; 0 otherwise. */
+  int** p2pAccessEnabled;
+
+  /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU
+     copies are allowed via p2p if p2p access is enabled at all for
+     the pair of GPUs in question, but if this flag is true, then
+     all cross-GPU access checks are disabled, allowing kernels to
+     directly access memory on another GPU.
+     Note that p2p access must exist and be enabled for the pair of
+     GPUs in question. */
+  int p2pKernelAccessEnabled;
+
+  void (*cutorchGCFunction)(void *data);
+  void *cutorchGCData;
+  ptrdiff_t heapSoftmax;
+  ptrdiff_t heapDelta;
+};