ROCm · jithunnair-amd · Jul 2, 2018 · whchung · Jul 2, 2018 · whchung
diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
@@ -10,7 +10,7 @@
 
 #if CUDNN_VERSION < 7000
 
-#include <curand_kernel.h>
+//#include <curand_kernel.h>
 
 /*
 Note [cuDNN dropout descriptor initialization]
@@ -233,7 +233,8 @@ inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
   if (ret != CUDNN_STATUS_SUCCESS) return ret;
   if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
   dropoutDesc->dropout = dropout;
-  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
+//  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
+  dropoutDesc->nstates = (int)stateSizeInBytes;
   dropoutDesc->states = states;
   return CUDNN_STATUS_SUCCESS;
 }

diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
@@ -3,9 +3,9 @@
 #include "ATen/cuda/CUDAApplyUtils.cuh"
 #include "ATen/AccumulateType.h"
 
-#include <curand.h>
-#include <curand_kernel.h>
-#include <curand_philox4x32_x.h>
+//#include <curand.h>
+//#include <curand_kernel.h>
+//#include <curand_philox4x32_x.h>
 #include <utility>
 #include <functional>
 #include <nvfunctional>

diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
@@ -180,15 +180,15 @@ __global__ void renorm_kernel(
     } else if (norm_type == 2) {
       v += x * x;
     } else {
-      v += std::pow(x, norm_type);
+      //v += std::pow(x, norm_type);
     }
   }
 
   using Op = ReduceAdd<accscalar_t>;
   v = reduceBlock<accscalar_t>(sdata, blockDim.x, v, Op(), 0);
 
   if (tid == 0) {
-    sdata[0] = std::pow(v, static_cast<accscalar_t>(1.0 / norm_type));
+    //sdata[0] = std::pow(v, static_cast<accscalar_t>(1.0 / norm_type));
   }
   __syncthreads();
 

diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu
@@ -46,17 +46,17 @@ void magmaGesvBatched<double>(
       dB_array, lddb, dinfo_array, batch_count, queue);
 }
 
-static magma_queue_t createMagmaQueue(const Tensor& tensor) {
-  auto& context = tensor.type().get_context();
-  magma_queue_t magma_queue;
-  magma_queue_create_from_cuda(
-      tensor.get_device(),
-      context.getCurrentCUDAStream(),
-      THCState_getCurrentBlasHandle(context.getTHCState()),
-      THCState_getCurrentSparseHandle(context.getTHCState()),
-      &magma_queue);
-  return magma_queue;
-}
+//static magma_queue_t createMagmaQueue(const Tensor& tensor) {
+//  auto& context = tensor.type().get_context();
+//  magma_queue_t magma_queue;
+//  magma_queue_create_from_cuda(
+//      tensor.get_device(),
+//      context.getCurrentCUDAStream(),
+//      THCState_getCurrentBlasHandle(context.getTHCState()),
+//      THCState_getCurrentSparseHandle(context.getTHCState()),
+//      &magma_queue);
+//  return magma_queue;
+//}
 
 static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
   auto result = static_cast<magma_int_t>(value);
@@ -116,9 +116,9 @@ AT_ERROR("gesv: MAGMA library not found in "
     ipiv_array[i] = &ipiv_data[i * n];
   }
 
-  magmaGesvBatched<scalar_t>(
-      n, nrhs, A_array, n, ipiv_array, b_array, n,
-      info_array, batch_size, createMagmaQueue(b));
+//  magmaGesvBatched<scalar_t>(
+//      n, nrhs, A_array, n, ipiv_array, b_array, n,
+//      info_array, batch_size, createMagmaQueue(b));
 
   for (int64_t i = 0; i < batch_size; i++) {
     infos[i] = info_array[i];

diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu
@@ -18,7 +18,7 @@ namespace {
 template<typename T, typename AccumT>
 struct LogSoftMaxForwardEpilogue {
   __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
-    : logsum(max_input + std::log(sum)) {}
+    : logsum(max_input /*+ std::log(sum)*/ ) {}
 
   __device__ __forceinline__ T operator()(T input) const {
     return static_cast<T>(input - logsum);
@@ -33,7 +33,7 @@ struct LogSoftMaxBackwardEpilogue {
     : sum(sum) {}
 
   __device__ __forceinline__ T operator()(T gradOutput, T output) const {
-    return static_cast<T>(gradOutput - std::exp(static_cast<AccumT>(output)) * sum);
+    return static_cast<T>(gradOutput /*- std::exp(static_cast<AccumT>(output)) * sum */ );
   }
 
   const AccumT sum;
@@ -46,7 +46,7 @@ struct SoftMaxForwardEpilogue {
     , sum(sum) {}
 
   __device__ __forceinline__ T operator()(T input) const {
-    return static_cast<T>(std::exp(input - max_input) / sum);
+    return static_cast<T>(0); // std::exp(input - max_input) / sum);
   }
 
   const AccumT max_input;
@@ -203,9 +203,9 @@ __global__ void cunn_SpatialSoftMaxForward(
         max_input = spatialBlockReduceX<accscalar_t, Max>(sdata,max_input);
 
         accscalar_t sum = 0;
-        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-          sum += std::exp(static_cast<accscalar_t>(input[data_offset + d * dim_stride])
-                 - max_input);
+        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {}
+          //sum += std::exp(static_cast<accscalar_t>(input[data_offset + d * dim_stride])
+          //       - max_input);
         sum = spatialBlockReduceX<accscalar_t, Add>(sdata, sum);
 
         Epilogue<scalar_t, accscalar_t> epilogue(max_input, sum);
@@ -218,9 +218,9 @@ __global__ void cunn_SpatialSoftMaxForward(
           max_input = Max<accscalar_t>()(max_input, value);
         }
         accscalar_t sum = 0;
-        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-          sum += std::exp(static_cast<accscalar_t>(input[data_offset + d * dim_stride])
-                 - max_input);
+        for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {}
+          //sum += std::exp(static_cast<accscalar_t>(input[data_offset + d * dim_stride])
+          //       - max_input);
         Epilogue<scalar_t, accscalar_t> epilogue(max_input, sum);
         for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
           output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]);
@@ -284,7 +284,7 @@ template <typename T, typename AccumT>
 struct MaxFloat
 {
   __device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
-    return ::max(max, (AccumT)v);
+    return /*::max(max,*/ (AccumT)v /*)*/ ;
   }
 };
 
@@ -303,7 +303,7 @@ struct SumExpFloat
     : max_k(v) {}
 
   __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
-    return sum + std::exp(v - max_k);
+    return sum; // + std::exp(v - max_k);
   }
 
   const AccumT max_k;

diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp
@@ -173,12 +173,12 @@ void THCudaShutdown(THCState* state)
       THCublasCheck(cublasDestroy(res->blasHandles[i]));
     }
     /* Free user defined sparse handles */
-    for (int i = 0; i < res->numSparseHandles; ++i) {
-      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
-    }
+//    for (int i = 0; i < res->numSparseHandles; ++i) {
+//      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
+//    }
 
     free(res->blasHandles);
-    free(res->sparseHandles);
+//    free(res->sparseHandles);
     THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
     THCThreadLocal_free(state->currentStreams[dev]);
   }
@@ -354,14 +354,14 @@ void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSpa
   THCudaCheck(cudaGetDevice(&prevDev));
   THCudaCheck(cudaSetDevice(device));
 
-  size_t size = numSparseHandles * sizeof(cusparseHandle_t);
-  cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
-  for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
-    handles[i] = NULL;
-    THCusparseCheck(cusparseCreate(&handles[i]));
-  }
-  res->sparseHandles = handles;
-  res->numSparseHandles = numSparseHandles;
+//  size_t size = numSparseHandles * sizeof(cusparseHandle_t);
+//  cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
+//  for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
+//    handles[i] = NULL;
+//    THCusparseCheck(cusparseCreate(&handles[i]));
+//  }
+//  res->sparseHandles = handles;
+//  res->numSparseHandles = numSparseHandles;
 
   THCudaCheck(cudaSetDevice(prevDev));
 }
@@ -419,16 +419,16 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han
   return res->blasHandles[handle - 1];
 }
 
-cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
-{
-  if (handle <= 0 || handle > state->numUserSparseHandles) {
-    THError("%d is not a valid handle, valid range is: (1, %d)",
-            handle, state->numUserSparseHandles);
-  }
-  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
-  THCState_reserveDeviceSparseHandles(state, device, handle);
-  return res->sparseHandles[handle - 1];
-}
+//cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
+//{
+//  if (handle <= 0 || handle > state->numUserSparseHandles) {
+//    THError("%d is not a valid handle, valid range is: (1, %d)",
+//            handle, state->numUserSparseHandles);
+//  }
+//  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+//  THCState_reserveDeviceSparseHandles(state, device, handle);
+//  return res->sparseHandles[handle - 1];
+//}
 
 static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
 {
@@ -493,21 +493,21 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
   return NULL;
 }
 
-cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
-{
-  /* This is called at the point of kernel execution.
-     For some debugging code or improperly instrumented kernels,
-     `state` is null */
-  if (state) {
-    int device;
-    THCudaCheck(cudaGetDevice(&device));
-
-    int handle = THCState_getCurrentSparseHandleIndex(state);
-    return THCState_getDeviceSparseHandle(state, device, handle);
-  }
-  THError("THCState and sparseHandles must be set as there is no default sparseHandle");
-  return NULL;
-}
+//cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
+//{
+//  /* This is called at the point of kernel execution.
+//     For some debugging code or improperly instrumented kernels,
+//     `state` is null */
+//  if (state) {
+//    int device;
+//    THCudaCheck(cudaGetDevice(&device));
+//
+//    int handle = THCState_getCurrentSparseHandleIndex(state);
+//    return THCState_getDeviceSparseHandle(state, device, handle);
+//  }
+//  THError("THCState and sparseHandles must be set as there is no default sparseHandle");
+//  return NULL;
+//}
 
 int THCState_getCurrentBlasHandleIndex(THCState *state)
 {
@@ -643,54 +643,54 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
   }
 }
 
-void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
-{
-  if(status != CUSPARSE_STATUS_SUCCESS)
-  {
-    const char* errmsg = NULL;
-
-    switch(status)
-    {
-      case CUSPARSE_STATUS_NOT_INITIALIZED:
-        errmsg = "library not initialized";
-        break;
-
-      case CUSPARSE_STATUS_ALLOC_FAILED:
-        errmsg = "resource allocation failed";
-        break;
-
-      case CUSPARSE_STATUS_INVALID_VALUE:
-        errmsg = "an invalid numeric value was used as an argument";
-        break;
-
-      case CUSPARSE_STATUS_ARCH_MISMATCH:
-        errmsg = "an absent device architectural feature is required";
-        break;
-
-      case CUSPARSE_STATUS_MAPPING_ERROR:
-        errmsg = "an access to GPU memory space failed";
-        break;
-
-      case CUSPARSE_STATUS_EXECUTION_FAILED:
-        errmsg = "the GPU program failed to execute";
-        break;
-
-      case CUSPARSE_STATUS_INTERNAL_ERROR:
-        errmsg = "an internal operation failed";
-        break;
-
-      case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-        errmsg = "the matrix type is not supported by this function";
-        break;
-
-      default:
-        errmsg = "unknown error";
-        break;
-    }
-
-    _THError(file, line, "cusparse runtime error : %s", errmsg);
-  }
-}
+//void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
+//{
+//  if(status != CUSPARSE_STATUS_SUCCESS)
+//  {
+//    const char* errmsg = NULL;
+//
+//    switch(status)
+//    {
+//      case CUSPARSE_STATUS_NOT_INITIALIZED:
+//        errmsg = "library not initialized";
+//        break;
+//
+//      case CUSPARSE_STATUS_ALLOC_FAILED:
+//        errmsg = "resource allocation failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_INVALID_VALUE:
+//        errmsg = "an invalid numeric value was used as an argument";
+//        break;
+//
+//      case CUSPARSE_STATUS_ARCH_MISMATCH:
+//        errmsg = "an absent device architectural feature is required";
+//        break;
+//
+//      case CUSPARSE_STATUS_MAPPING_ERROR:
+//        errmsg = "an access to GPU memory space failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_EXECUTION_FAILED:
+//        errmsg = "the GPU program failed to execute";
+//        break;
+//
+//      case CUSPARSE_STATUS_INTERNAL_ERROR:
+//        errmsg = "an internal operation failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+//        errmsg = "the matrix type is not supported by this function";
+//        break;
+//
+//      default:
+//        errmsg = "unknown error";
+//        break;
+//    }
+//
+//    _THError(file, line, "cusparse runtime error : %s", errmsg);
+//  }
+//}
 
 void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data )
 {