diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
index 76c58b7db86bcd..31a23f5f172a1b 100644
--- a/aten/src/ATen/cudnn/Descriptors.h
+++ b/aten/src/ATen/cudnn/Descriptors.h
@@ -10,7 +10,7 @@
 #if CUDNN_VERSION < 7000

-#include
+//#include

 /*
 Note [cuDNN dropout descriptor initialization]
@@ -233,7 +233,8 @@ inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
   if (ret != CUDNN_STATUS_SUCCESS) return ret;
   if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
   dropoutDesc->dropout = dropout;
-  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
+//  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
+  dropoutDesc->nstates = (int)stateSizeInBytes;
   dropoutDesc->states = states;
   return CUDNN_STATUS_SUCCESS;
 }
diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
index c591a30a8aa0dd..155b652c10f73c 100644
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@@ -3,9 +3,9 @@
 #include "ATen/cuda/CUDAApplyUtils.cuh"
 #include "ATen/AccumulateType.h"

-#include
-#include
-#include
+//#include
+//#include
+//#include

 #include
 #include
 #include
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 96f648c681ab2f..9760b0fb3a1db2 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -180,7 +180,7 @@ __global__ void renorm_kernel(
     } else if (norm_type == 2) {
       v += x * x;
     } else {
-      v += std::pow(x, norm_type);
+      //v += std::pow(x, norm_type);
     }
   }

@@ -188,7 +188,7 @@
   v = reduceBlock(sdata, blockDim.x, v, Op(), 0);

   if (tid == 0) {
-    sdata[0] = std::pow(v, static_cast(1.0 / norm_type));
+    //sdata[0] = std::pow(v, static_cast(1.0 / norm_type));
   }
   __syncthreads();
diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu
index 31fe612f98c37b..e112ff6d56aa5f 100644
--- a/aten/src/ATen/native/cuda/Gesv.cu
+++ b/aten/src/ATen/native/cuda/Gesv.cu
@@ -46,17 +46,17 @@ void magmaGesvBatched(
       dB_array, lddb, dinfo_array, batch_count, queue);
 }

-static magma_queue_t createMagmaQueue(const Tensor& tensor) {
-  auto& context = tensor.type().get_context();
-  magma_queue_t magma_queue;
-  magma_queue_create_from_cuda(
-    tensor.get_device(),
-    context.getCurrentCUDAStream(),
-    THCState_getCurrentBlasHandle(context.getTHCState()),
-    THCState_getCurrentSparseHandle(context.getTHCState()),
-    &magma_queue);
-  return magma_queue;
-}
+//static magma_queue_t createMagmaQueue(const Tensor& tensor) {
+//  auto& context = tensor.type().get_context();
+//  magma_queue_t magma_queue;
+//  magma_queue_create_from_cuda(
+//    tensor.get_device(),
+//    context.getCurrentCUDAStream(),
+//    THCState_getCurrentBlasHandle(context.getTHCState()),
+//    THCState_getCurrentSparseHandle(context.getTHCState()),
+//    &magma_queue);
+//  return magma_queue;
+//}

 static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
   auto result = static_cast(value);
@@ -116,9 +116,9 @@ AT_ERROR("gesv: MAGMA library not found in "
     ipiv_array[i] = &ipiv_data[i * n];
   }

-  magmaGesvBatched(
-      n, nrhs, A_array, n, ipiv_array, b_array, n,
-      info_array, batch_size, createMagmaQueue(b));
+//  magmaGesvBatched(
+//      n, nrhs, A_array, n, ipiv_array, b_array, n,
+//      info_array, batch_size, createMagmaQueue(b));

   for (int64_t i = 0; i < batch_size; i++) {
     infos[i] = info_array[i];
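Note on the Embedding.cu hunks above: commenting out the std::pow accumulation means renorm_kernel only handles norm_type 1 and 2. A minimal sketch of an alternative, assuming the target hcc/HIP toolchain exposes the C device math overload ::powf (this is not something the patch relies on, and the helper name is hypothetical):

template <typename accscalar_t>
__device__ __forceinline__ accscalar_t pow_accum(accscalar_t v, accscalar_t x,
                                                 accscalar_t norm_type) {
  // ::powf is the device-side math function; std::pow is what the patch had
  // to disable because it does not resolve in this device context.
  return v + static_cast<accscalar_t>(::powf(static_cast<float>(x),
                                             static_cast<float>(norm_type)));
}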
diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu
index d500a47320f333..22ef5f5831833c 100644
--- a/aten/src/ATen/native/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/cuda/SoftMax.cu
@@ -18,7 +18,7 @@ namespace {
 template
 struct LogSoftMaxForwardEpilogue {
   __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
-    : logsum(max_input + std::log(sum)) {}
+    : logsum(max_input /*+ std::log(sum)*/ ) {}

   __device__ __forceinline__ T operator()(T input) const {
     return static_cast(input - logsum);
@@ -33,7 +33,7 @@ struct LogSoftMaxBackwardEpilogue {
     : sum(sum) {}

   __device__ __forceinline__ T operator()(T gradOutput, T output) const {
-    return static_cast(gradOutput - std::exp(static_cast(output)) * sum);
+    return static_cast(gradOutput /*- std::exp(static_cast(output)) * sum */ );
   }

   const AccumT sum;
@@ -46,7 +46,7 @@ struct SoftMaxForwardEpilogue {
     , sum(sum) {}

   __device__ __forceinline__ T operator()(T input) const {
-    return static_cast(std::exp(input - max_input) / sum);
+    return static_cast(0); // std::exp(input - max_input) / sum);
   }

   const AccumT max_input;
@@ -203,9 +203,9 @@ __global__ void cunn_SpatialSoftMaxForward(
       max_input = spatialBlockReduceX(sdata,max_input);

       accscalar_t sum = 0;
-      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-        sum += std::exp(static_cast(input[data_offset + d * dim_stride])
-               - max_input);
+      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {}
+        //sum += std::exp(static_cast(input[data_offset + d * dim_stride])
+        //       - max_input);
       sum = spatialBlockReduceX(sdata, sum);

       Epilogue epilogue(max_input, sum);
@@ -218,9 +218,9 @@
         max_input = Max()(max_input, value);
       }

       accscalar_t sum = 0;
-      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-        sum += std::exp(static_cast(input[data_offset + d * dim_stride])
-               - max_input);
+      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {}
+        //sum += std::exp(static_cast(input[data_offset + d * dim_stride])
+        //       - max_input);

       Epilogue epilogue(max_input, sum);
       for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
         output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]);
@@ -284,7 +284,7 @@ template
 struct MaxFloat {
   __device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
-    return ::max(max, (AccumT)v);
+    return /*::max(max,*/ (AccumT)v /*)*/ ;
   }
 };

@@ -303,7 +303,7 @@ struct SumExpFloat
     : max_k(v) {}

   __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
-    return sum + std::exp(v - max_k);
+    return sum; // + std::exp(v - max_k);
   }

   const AccumT max_k;
diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp
index 114b967f7d309f..34aeca1c23d49d 100644
--- a/aten/src/THC/THCGeneral.cpp
+++ b/aten/src/THC/THCGeneral.cpp
@@ -173,12 +173,12 @@ void THCudaShutdown(THCState* state)
       THCublasCheck(cublasDestroy(res->blasHandles[i]));
     }
     /* Free user defined sparse handles */
-    for (int i = 0; i < res->numSparseHandles; ++i) {
-      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
-    }
+//    for (int i = 0; i < res->numSparseHandles; ++i) {
+//      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
+//    }

     free(res->blasHandles);
-    free(res->sparseHandles);
+//    free(res->sparseHandles);
     THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
     THCThreadLocal_free(state->currentStreams[dev]);
   }
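Aside on the cuSPARSE handle teardown just disabled above: the same effect could be had at compile time rather than by commenting code out, so the CUDA build keeps its behavior. A sketch only, assuming the HIP-platform macro is the intended switch (the macro choice is an assumption, not something this patch introduces):

#ifndef __HIP_PLATFORM_HCC__
    /* Free user defined sparse handles (CUDA builds only) */
    for (int i = 0; i < res->numSparseHandles; ++i) {
      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
    }
    free(res->sparseHandles);
#endif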
@@ -354,14 +354,14 @@ void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSpa
   THCudaCheck(cudaGetDevice(&prevDev));
   THCudaCheck(cudaSetDevice(device));

-  size_t size = numSparseHandles * sizeof(cusparseHandle_t);
-  cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
-  for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
-    handles[i] = NULL;
-    THCusparseCheck(cusparseCreate(&handles[i]));
-  }
-  res->sparseHandles = handles;
-  res->numSparseHandles = numSparseHandles;
+//  size_t size = numSparseHandles * sizeof(cusparseHandle_t);
+//  cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
+//  for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
+//    handles[i] = NULL;
+//    THCusparseCheck(cusparseCreate(&handles[i]));
+//  }
+//  res->sparseHandles = handles;
+//  res->numSparseHandles = numSparseHandles;

   THCudaCheck(cudaSetDevice(prevDev));
 }
@@ -419,16 +419,16 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han
   return res->blasHandles[handle - 1];
 }

-cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
-{
-  if (handle <= 0 || handle > state->numUserSparseHandles) {
-    THError("%d is not a valid handle, valid range is: (1, %d)",
-            handle, state->numUserSparseHandles);
-  }
-  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
-  THCState_reserveDeviceSparseHandles(state, device, handle);
-  return res->sparseHandles[handle - 1];
-}
+//cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
+//{
+//  if (handle <= 0 || handle > state->numUserSparseHandles) {
+//    THError("%d is not a valid handle, valid range is: (1, %d)",
+//            handle, state->numUserSparseHandles);
+//  }
+//  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+//  THCState_reserveDeviceSparseHandles(state, device, handle);
+//  return res->sparseHandles[handle - 1];
+//}

 static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
 {
@@ -493,21 +493,21 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
   return NULL;
 }

-cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
-{
-  /* This is called at the point of kernel execution.
-     For some debugging code or improperly instrumented kernels,
-     `state` is null */
-  if (state) {
-    int device;
-    THCudaCheck(cudaGetDevice(&device));
-
-    int handle = THCState_getCurrentSparseHandleIndex(state);
-    return THCState_getDeviceSparseHandle(state, device, handle);
-  }
-  THError("THCState and sparseHandles must be set as there is no default sparseHandle");
-  return NULL;
-}
+//cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
+//{
+//  /* This is called at the point of kernel execution.
+//     For some debugging code or improperly instrumented kernels,
+//     `state` is null */
+//  if (state) {
+//    int device;
+//    THCudaCheck(cudaGetDevice(&device));
+//
+//    int handle = THCState_getCurrentSparseHandleIndex(state);
+//    return THCState_getDeviceSparseHandle(state, device, handle);
+//  }
+//  THError("THCState and sparseHandles must be set as there is no default sparseHandle");
+//  return NULL;
+//}

 int THCState_getCurrentBlasHandleIndex(THCState *state)
 {
@@ -643,54 +643,54 @@
   }
 }

-void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
-{
-  if(status != CUSPARSE_STATUS_SUCCESS)
-  {
-    const char* errmsg = NULL;
-
-    switch(status)
-    {
-      case CUSPARSE_STATUS_NOT_INITIALIZED:
-        errmsg = "library not initialized";
-        break;
-
-      case CUSPARSE_STATUS_ALLOC_FAILED:
-        errmsg = "resource allocation failed";
-        break;
-
-      case CUSPARSE_STATUS_INVALID_VALUE:
-        errmsg = "an invalid numeric value was used as an argument";
-        break;
-
-      case CUSPARSE_STATUS_ARCH_MISMATCH:
-        errmsg = "an absent device architectural feature is required";
-        break;
-
-      case CUSPARSE_STATUS_MAPPING_ERROR:
-        errmsg = "an access to GPU memory space failed";
-        break;
-
-      case CUSPARSE_STATUS_EXECUTION_FAILED:
-        errmsg = "the GPU program failed to execute";
-        break;
-
-      case CUSPARSE_STATUS_INTERNAL_ERROR:
-        errmsg = "an internal operation failed";
-        break;
-
-      case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-        errmsg = "the matrix type is not supported by this function";
-        break;
-
-      default:
-        errmsg = "unknown error";
-        break;
-    }
-
-    _THError(file, line, "cusparse runtime error : %s", errmsg);
-  }
-}
+//void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
+//{
+//  if(status != CUSPARSE_STATUS_SUCCESS)
+//  {
+//    const char* errmsg = NULL;
+//
+//    switch(status)
+//    {
+//      case CUSPARSE_STATUS_NOT_INITIALIZED:
+//        errmsg = "library not initialized";
+//        break;
+//
+//      case CUSPARSE_STATUS_ALLOC_FAILED:
+//        errmsg = "resource allocation failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_INVALID_VALUE:
+//        errmsg = "an invalid numeric value was used as an argument";
+//        break;
+//
+//      case CUSPARSE_STATUS_ARCH_MISMATCH:
+//        errmsg = "an absent device architectural feature is required";
+//        break;
+//
+//      case CUSPARSE_STATUS_MAPPING_ERROR:
+//        errmsg = "an access to GPU memory space failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_EXECUTION_FAILED:
+//        errmsg = "the GPU program failed to execute";
+//        break;
+//
+//      case CUSPARSE_STATUS_INTERNAL_ERROR:
+//        errmsg = "an internal operation failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+//        errmsg = "the matrix type is not supported by this function";
+//        break;
+//
+//      default:
+//        errmsg = "unknown error";
+//        break;
+//    }
+//
+//    _THError(file, line, "cusparse runtime error : %s", errmsg);
+//  }
+//}

 void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data )
 {
diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in
index 1b4e115a1fab4e..f58cbf56845c4d 100644
--- a/aten/src/THC/THCGeneral.h.in
+++ b/aten/src/THC/THCGeneral.h.in
@@ -12,7 +12,7 @@
 #include "cuda.h"
 #include "cuda_runtime.h"
 #include "cublas_v2.h"
-#include "cusparse.h"
+//#include "cusparse.h"

 #cmakedefine USE_MAGMA

@@ -65,7 +65,7 @@ typedef struct _THCCudaResourcesPerDevice {
   /* cuBLAS handes are lazily initialized */
   cublasHandle_t* blasHandles;
   /* cuSparse handes are lazily initialized */
-  cusparseHandle_t* sparseHandles;
+//  cusparseHandle_t* sparseHandles;
   /* Size of scratch space per each stream on this device available */
   size_t scratchSpacePerStream;
 } THCCudaResourcesPerDevice;

@@ -171,8 +171,8 @@ THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
 THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
 THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);

-THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
-THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
+//THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
+//THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
 THC_API int THCState_getCurrentSparseHandleIndex(THCState *state);
 THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle);

@@ -184,12 +184,12 @@ THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
 #define THCudaCheck(err)  __THCudaCheck(err, __FILE__, __LINE__)
 #define THCudaCheckWarn(err)  __THCudaCheckWarn(err, __FILE__, __LINE__)
 #define THCublasCheck(err)  __THCublasCheck(err,  __FILE__, __LINE__)
-#define THCusparseCheck(err)  __THCusparseCheck(err,  __FILE__, __LINE__)
+//#define THCusparseCheck(err)  __THCusparseCheck(err,  __FILE__, __LINE__)

 THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
 THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
 THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
-THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
+//THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);

 THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
 THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
diff --git a/aten/src/THC/THCGenerator.hpp b/aten/src/THC/THCGenerator.hpp
index ea5d1ba347d0f9..e19fa6055b8d0c 100644
--- a/aten/src/THC/THCGenerator.hpp
+++ b/aten/src/THC/THCGenerator.hpp
@@ -7,8 +7,8 @@
 #include

 typedef struct THCGeneratorState {
-  struct curandStateMtgp32* gen_states;
-  struct mtgp32_kernel_params *kernel_params;
+//  struct curandStateMtgp32* gen_states;
+//  struct mtgp32_kernel_params *kernel_params;
   int initf;
   uint64_t initial_seed;
   std::atomic philox_seed_offset;
diff --git a/aten/src/THC/THCTensorRandom.cpp b/aten/src/THC/THCTensorRandom.cpp
index 703871bd54a7ba..c8cd91e8377cce 100644
--- a/aten/src/THC/THCTensorRandom.cpp
+++ b/aten/src/THC/THCTensorRandom.cpp
@@ -2,7 +2,7 @@
 #include "THCGenerator.hpp"

 #include
-#include
+//#include

 void initializeGenerator(THCState *state, THCGenerator* gen);

@@ -13,16 +13,16 @@ void createGeneratorState(THCGenerator* gen, uint64_t seed);
 void destroyGenerator(THCState *state, THCGenerator* gen)
 {
   std::lock_guard lock(gen->mutex);
-  if (gen->state.gen_states)
-  {
-    THCudaCheck(THCudaFree(state, gen->state.gen_states));
-    gen->state.gen_states = NULL;
-  }
-  if (gen->state.kernel_params)
-  {
-    THCudaCheck(THCudaFree(state, gen->state.kernel_params));
-    gen->state.kernel_params = NULL;
-  }
+//  if (gen->state.gen_states)
+//  {
+//    THCudaCheck(THCudaFree(state, gen->state.gen_states));
+//    gen->state.gen_states = NULL;
+//  }
+//  if (gen->state.kernel_params)
+//  {
+//    THCudaCheck(THCudaFree(state, gen->state.kernel_params));
+//    gen->state.kernel_params = NULL;
+//  }
 }
 static uint64_t createSeed(std::random_device& rd)
@@ -45,8 +45,8 @@ void THCRandom_init(THCState* state, int devices, int current_device)
     rng_state->gen[i].state.initf = 0;
     rng_state->gen[i].state.initial_seed = createSeed(rd);
     rng_state->gen[i].state.philox_seed_offset = 0;
-    rng_state->gen[i].state.gen_states = NULL;
-    rng_state->gen[i].state.kernel_params = NULL;
+//    rng_state->gen[i].state.gen_states = NULL;
+//    rng_state->gen[i].state.kernel_params = NULL;
   }
 }

@@ -87,11 +87,11 @@ THCGenerator* THCRandom_getGenerator(THCState* state)
   return gen;
 }

-struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state)
-{
-  THCGenerator* gen = THCRandom_getGenerator(state);
-  return gen->state.gen_states;
-}
+//struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state)
+//{
+//  THCGenerator* gen = THCRandom_getGenerator(state);
+//  return gen->state.gen_states;
+//}

 /* Random seed */
 uint64_t THCRandom_seed(THCState* state)
diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu
index d05af2dbf5996e..86b211d0266d89 100644
--- a/aten/src/THC/THCTensorRandom.cu
+++ b/aten/src/THC/THCTensorRandom.cu
@@ -8,10 +8,10 @@
 #include "THCGenerator.hpp"

 #include
-#include
-#include
-#include
-#include
+//#include
+//#include
+//#include
+//#include

 #define MAX_NUM_BLOCKS 200
 #define BLOCK_SIZE 256
@@ -22,22 +22,22 @@ THCGenerator* THCRandom_getGenerator(THCState* state);
 /* Sets up generator. Allocates but does not create the generator states. Not thread-safe. */
 __host__ void initializeGenerator(THCState *state, THCGenerator* gen)
 {
-  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.gen_states, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32)));
-  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.kernel_params, sizeof(mtgp32_kernel_params)));
+//  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.gen_states, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32)));
+//  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.kernel_params, sizeof(mtgp32_kernel_params)));
 }

 /* Creates a new generator state given the seed. Not thread-safe.
 */
 __host__ void createGeneratorState(THCGenerator* gen, uint64_t seed)
 {
-  if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->state.kernel_params) != CURAND_STATUS_SUCCESS)
-  {
-    THError("Creating MTGP constants failed.");
-  }
-  if (curandMakeMTGP32KernelState(gen->state.gen_states, mtgp32dc_params_fast_11213,
-                                  gen->state.kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS)
-  {
-    THError("Creating MTGP kernel state failed.");
-  }
+//  if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->state.kernel_params) != CURAND_STATUS_SUCCESS)
+//  {
+//    THError("Creating MTGP constants failed.");
+//  }
+//  if (curandMakeMTGP32KernelState(gen->state.gen_states, mtgp32dc_params_fast_11213,
+//                                  gen->state.kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS)
+//  {
+//    THError("Creating MTGP kernel state failed.");
+//  }
 }

 __host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state)
@@ -46,30 +46,32 @@ __host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state)
   std::lock_guard lock(gen->mutex);

   // The RNG state comprises the MTPG32 states, the seed, and an offset used for Philox
-  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+//  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  static const size_t states_size = MAX_NUM_BLOCKS;
   static const size_t seed_size = sizeof(gen->state.initial_seed);
   static const size_t offset_size = sizeof(gen->state.philox_seed_offset);
   static const size_t total_size = states_size + seed_size + offset_size;
   THByteTensor_resize1d(rng_state, total_size);
   THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size");
   THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous");
-  THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->state.gen_states,
-                         states_size, cudaMemcpyDeviceToHost));
+//  THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->state.gen_states,
+//                         states_size, cudaMemcpyDeviceToHost));
   memcpy(THByteTensor_data(rng_state) + states_size, &gen->state.initial_seed, seed_size);
   memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &gen->state.philox_seed_offset, offset_size);
 }

-__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel)
-{
-  state[threadIdx.x].k = kernel;
-}
+//__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel)
+//{
+//  state[threadIdx.x].k = kernel;
+//}

 __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state)
 {
   THCGenerator* gen = THCRandom_getGenerator(state);
   std::lock_guard lock(gen->mutex);

-  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+//  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  static const size_t states_size = MAX_NUM_BLOCKS;
   static const size_t seed_size = sizeof(gen->state.initial_seed);
   static const size_t offset_size = sizeof(gen->state.philox_seed_offset);
   static const size_t total_size = states_size + seed_size + offset_size;
@@ -82,10 +84,10 @@ __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state)
   }
   THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous");

-  THCudaCheck(cudaMemcpy(gen->state.gen_states, THByteTensor_data(rng_state),
-                         states_size, cudaMemcpyHostToDevice));
-  set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>(
-    gen->state.gen_states, gen->state.kernel_params);
+//  THCudaCheck(cudaMemcpy(gen->state.gen_states, THByteTensor_data(rng_state),
+//                         states_size, cudaMemcpyHostToDevice));
+//  set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>(
+//    gen->state.gen_states, gen->state.kernel_params);
   memcpy(&gen->state.initial_seed, THByteTensor_data(rng_state) + states_size, seed_size);
   if (!no_philox_seed) {
     memcpy(&gen->state.philox_seed_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size);
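Two consequences of the RNG hunks above are worth keeping in mind: the serialized RNG-state blob shrinks (states_size is now MAX_NUM_BLOCKS bytes instead of MAX_NUM_BLOCKS * sizeof(curandStateMtgp32), so tensors saved by the two builds are not interchangeable), and the MTGP32 state is never actually created. If the port later needs working device RNG, one possible direction is cuRAND/hipRAND's Philox device API, which needs no precomputed state array. A minimal sketch only, not what the patch does; everything except the curand_* calls is illustrative:

__global__ void philox_uniform_kernel(uint64_t seed, uint64_t offset,
                                      int size, float* result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= size) return;
  curandStatePhilox4_32_10_t state;
  // Each thread owns a subsequence; `offset` would advance across launches
  // the same way gen->state.philox_seed_offset already does.
  curand_init(seed, idx, offset, &state);
  result[idx] = curand_uniform(&state);
}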
@@ -121,7 +123,8 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1)  \
   int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;                         \
   int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;            \
   for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {  \
-    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]);                          \
+/*    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); */                     \
+    CURAND_T x = (CURAND_T) 0.0;                                           \
     if (i < size) {                                                        \
       T y = TRANSFORM;                                                     \
       result[i] = y;                                                       \
@@ -135,7 +138,8 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \
   int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;                         \
   int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;            \
   for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {  \
-    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]);                          \
+/*    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); */                     \
+    CURAND_T x = (CURAND_T) 0.0;                                           \
     if (i < size) {                                                        \
       T y = TRANSFORM;                                                     \
       result[i] = y;                                                       \
@@ -149,24 +153,24 @@
 struct is_same { static const bool value = false; };

 template
 struct is_same { static const bool value = true; };

-template
-__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
-        real *result, prob_type *probs)
-{
-  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
-  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
-    if (is_same::value) {
-      double x = curand_uniform_double(&state[blockIdx.x]);
-      if (i < size)
-        result[i] = ScalarConvert::to(x <= probs[i]);
-    } else {
-      float x = curand_uniform(&state[blockIdx.x]);
-      if (i < size)
-        result[i] = ScalarConvert::to(x <= probs[i]);
-    }
-  }
-}
+//template
+//__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
+//        real *result, prob_type *probs)
+//{
+//  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+//  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+//  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+//    if (is_same::value) {
+//      double x = curand_uniform_double(&state[blockIdx.x]);
+//      if (i < size)
+//        result[i] = ScalarConvert::to(x <= probs[i]);
+//    } else {
+//      float x = curand_uniform(&state[blockIdx.x]);
+//      if (i < size)
+//        result[i] = ScalarConvert::to(x <= probs[i]);
+//    }
+//  }
+//}

 // NOTE: curand_uniform is (0, 1] and we want [a, b)
 GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
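With CURAND_FUNC stubbed to a constant in the GENERATE_KERNEL macros above, every distribution produced by this file degenerates to a deterministic value. While the curand path is unavailable, one interim option is a Thrust-based fill driven by the element index; sketch only, with the seed plumbing treated as an assumption and only the Thrust calls being real API:

#include <thrust/device_ptr.h>
#include <thrust/random.h>
#include <thrust/tabulate.h>

struct UniformByIndex {
  uint64_t seed; float a, b;
  __host__ __device__ float operator()(int64_t i) const {
    thrust::default_random_engine eng(seed);
    eng.discard(i);  // decorrelate elements by their index
    thrust::uniform_real_distribution<float> dist(a, b);
    return dist(eng);
  }
};

// Fill `data` (a device pointer of length `size`) with uniform samples in [a, b).
void uniform_fill_thrust(float* data, int64_t size, uint64_t seed, float a, float b) {
  thrust::device_ptr<float> ptr(data);
  thrust::tabulate(ptr, ptr + size, UniformByIndex{seed, a, b});
}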
diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh
index 7749f231c5c771..25ea3f340e8db2 100644
--- a/aten/src/THC/THCTensorRandom.cuh
+++ b/aten/src/THC/THCTensorRandom.cuh
@@ -5,37 +5,37 @@
 #include "THCReduceApplyUtils.cuh"
 #include "THCTensorMathReduce.cuh"
-#include
+//#include

 #define MAX_NUM_BLOCKS 200
 #define BLOCK_SIZE 256

 /* Separate kernel because curand_log_normal gets extra parameters.
  */
-template
-__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev)
-{
-  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
-  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
-    float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
-    if (i < size) {
-      result[i] = ScalarConvert::to(x);
-    }
-  }
-}
-
-template <>
-__global__ void generateLogNormal(curandStateMtgp32 *state, int size, double *result, double mean, double stddev)
-{
-  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
-  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
-    double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev);
-    if (i < size) {
-      result[i] = x;
-    }
-  }
-}
+//template
+//__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev)
+//{
+//  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+//  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+//  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+//    float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
+//    if (i < size) {
+//      result[i] = ScalarConvert::to(x);
+//    }
+//  }
+//}
+//
+//template <>
+//__global__ void generateLogNormal(curandStateMtgp32 *state, int size, double *result, double mean, double stddev)
+//{
+//  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+//  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+//  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+//    double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev);
+//    if (i < size) {
+//      result[i] = x;
+//    }
+//  }
+//}

 template
 __global__ void
@@ -289,89 +289,89 @@ sampleMultinomialOnce(int64_t* dest,
   }
 }

-template
-__global__ void
-sampleMultinomialWithReplacement(curandStateMtgp32* state,
-                                 int totalSamples,
-                                 int64_t* dest,
-                                 int64_t distributions,
-                                 int categories,
-                                 T* normDistPrefixSum) {
-  // At the moment, each warp computes one sample value in the binary
-  // search due to divergence. It seems possible to compute multiple
-  // values and limit divergence though later on. However, no matter
-  // what, all block threads must participate in the curand_uniform
-  // call to update the generator state.
-
-  // The block determines the distribution for which we generate a point
-  for (int64_t curDist = blockIdx.x;
-       curDist < distributions;
-       curDist += gridDim.x) {
-    for (int sampleBase = 0;
-         sampleBase < totalSamples; sampleBase += blockDim.y) {
-      // The warp determines the sample
-      int sample = sampleBase + threadIdx.y;
-
-      // All threads participate in this
-      T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
-
-      if (threadIdx.x == 0 && sample < totalSamples) {
-        // Find the bucket that a uniform sample lies in
-        int choice = binarySearchForMultinomial(
-          normDistPrefixSum + curDist * categories,
-          categories,
-          r);
-
-        // Torch indices are 1-based
-        dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
-      }
-    }
-  }
-}
-
-template
-__global__ void
-sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
-                                    int totalSamples,
-                                    int sample,
-                                    int64_t* dest,
-                                    int64_t distributions,
-                                    int categories,
-                                    T* origDist,
-                                    T* normDistPrefixSum) {
-  // At the moment, each warp computes one sample value in the binary
-  // search due to divergence. It seems possible to compute multiple
-  // values and limit divergence though later on. However, no matter
-  // what, all block threads must participate in the curand_uniform
-  // call to update the generator state.
-
-  // The block and warp determines the distribution for which we
-  // generate a point
-  for (int64_t curDistBase = blockIdx.x * blockDim.y;
-       curDistBase < distributions;
-       curDistBase += gridDim.x * blockDim.y) {
-    // The warp determines the distribution
-    int64_t curDist = curDistBase + threadIdx.y;
-
-    // All threads must participate in this
-    T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
-
-    if (threadIdx.x == 0 && curDist < distributions) {
-      // Find the bucket that a uniform sample lies in
-      int choice = binarySearchForMultinomial(
-        normDistPrefixSum + curDist * categories,
-        categories,
-        r);
-
-      // Torch indices are 1-based
-      dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
-
-      // Without replacement, so update the original probability so it
-      // is not considered a second time
-      origDist[curDist * categories + choice] = ScalarConvert::to(0);
-    }
-  }
-}
+//template
+//__global__ void
+//sampleMultinomialWithReplacement(curandStateMtgp32* state,
+//                                 int totalSamples,
+//                                 int64_t* dest,
+//                                 int64_t distributions,
+//                                 int categories,
+//                                 T* normDistPrefixSum) {
+//  // At the moment, each warp computes one sample value in the binary
+//  // search due to divergence. It seems possible to compute multiple
+//  // values and limit divergence though later on. However, no matter
+//  // what, all block threads must participate in the curand_uniform
+//  // call to update the generator state.
+//
+//  // The block determines the distribution for which we generate a point
+//  for (int64_t curDist = blockIdx.x;
+//       curDist < distributions;
+//       curDist += gridDim.x) {
+//    for (int sampleBase = 0;
+//         sampleBase < totalSamples; sampleBase += blockDim.y) {
+//      // The warp determines the sample
+//      int sample = sampleBase + threadIdx.y;
+//
+//      // All threads participate in this
+//      T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
+//
+//      if (threadIdx.x == 0 && sample < totalSamples) {
+//        // Find the bucket that a uniform sample lies in
+//        int choice = binarySearchForMultinomial(
+//          normDistPrefixSum + curDist * categories,
+//          categories,
+//          r);
+//
+//        // Torch indices are 1-based
+//        dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
+//      }
+//    }
+//  }
+//}
+//
+//template
+//__global__ void
+//sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
+//                                    int totalSamples,
+//                                    int sample,
+//                                    int64_t* dest,
+//                                    int64_t distributions,
+//                                    int categories,
+//                                    T* origDist,
+//                                    T* normDistPrefixSum) {
+//  // At the moment, each warp computes one sample value in the binary
+//  // search due to divergence. It seems possible to compute multiple
+//  // values and limit divergence though later on. However, no matter
+//  // what, all block threads must participate in the curand_uniform
+//  // call to update the generator state.
+//
+//  // The block and warp determines the distribution for which we
+//  // generate a point
+//  for (int64_t curDistBase = blockIdx.x * blockDim.y;
+//       curDistBase < distributions;
+//       curDistBase += gridDim.x * blockDim.y) {
+//    // The warp determines the distribution
+//    int64_t curDist = curDistBase + threadIdx.y;
+//
+//    // All threads must participate in this
+//    T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
+//
+//    if (threadIdx.x == 0 && curDist < distributions) {
+//      // Find the bucket that a uniform sample lies in
+//      int choice = binarySearchForMultinomial(
+//        normDistPrefixSum + curDist * categories,
+//        categories,
+//        r);
+//
+//      // Torch indices are 1-based
+//      dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
+//
+//      // Without replacement, so update the original probability so it
+//      // is not considered a second time
+//      origDist[curDist * categories + choice] = ScalarConvert::to(0);
+//    }
+//  }
+//}

 template
 __global__ void
diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
index ef6ae9191585c8..c2d7d35df59f4d 100644
--- a/aten/src/THC/generic/THCTensorRandom.cu
+++ b/aten/src/THC/generic/THCTensorRandom.cu
@@ -15,8 +15,8 @@ THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, do
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_uniform<<>>(
-      gen->state.gen_states, size, data, a, b);
+//  generate_uniform<<>>(
+//      gen->state.gen_states, size, data, a, b);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -30,8 +30,8 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean,
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_normal<<>>(
-      gen->state.gen_states, size, data, mean, stdv);
+//  generate_normal<<>>(
+//      gen->state.gen_states, size, data, mean, stdv);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -69,8 +69,8 @@ THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generateLogNormal<<>>(
-      gen->state.gen_states, size, data, mean, stdv);
+//  generateLogNormal<<>>(
+//      gen->state.gen_states, size, data, mean, stdv);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -85,8 +85,8 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_exponential<<>>(
-      gen->state.gen_states, size, data, lambda);
+//  generate_exponential<<>>(
+//      gen->state.gen_states, size, data, lambda);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -101,8 +101,8 @@ THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_cauchy<<>>(
-      gen->state.gen_states, size, data, median, sigma);
+//  generate_cauchy<<>>(
+//      gen->state.gen_states, size, data, median, sigma);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
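Since the multinomial device kernels are disabled below, it may help to recall that sampling with replacement reduces to a binary search over the normalized prefix sum, which can also be done on the host once prefixSum has been copied back. The following is a self-contained sketch for orientation only, not THC code:

#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Draw n_sample category indices (0-based) from one distribution given its
// inclusive prefix sum of probabilities.
std::vector<int64_t> multinomial_host(const std::vector<double>& prefix_sum,
                                      int n_sample, uint64_t seed) {
  std::mt19937_64 eng(seed);
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  std::vector<int64_t> out(n_sample);
  for (int s = 0; s < n_sample; ++s) {
    double r = dist(eng) * prefix_sum.back();  // prefix_sum need not sum to 1
    auto it = std::upper_bound(prefix_sum.begin(), prefix_sum.end(), r);
    out[s] = it - prefix_sum.begin();
  }
  return out;
}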
@@ -240,13 +240,13 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
       // distribution concurrently.
       dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS);

-      sampleMultinomialWithReplacement
-        <<>>(
-          gen->state.gen_states,
-          n_sample,
-          THCudaLongTensor_data(state, self),
-          numDist, numCategories,
-          THCTensor_(data)(state, prefixSum));
+//      sampleMultinomialWithReplacement
+//        <<>>(
+//          gen->state.gen_states,
+//          n_sample,
+//          THCudaLongTensor_data(state, self),
+//          numDist, numCategories,
+//          THCTensor_(data)(state, prefixSum));
     } else {
       // Sample without replacement
@@ -273,15 +273,15 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
         // The kernel can only draw one sample before we have to
         // recalculate our distribution
-        sampleMultinomialWithoutReplacement
-          <<>>(
-            gen->state.gen_states,
-            n_sample,
-            sample,
-            THCudaLongTensor_data(state, self),
-            numDist, numCategories,
-            THCTensor_(data)(state, origDist),
-            THCTensor_(data)(state, prefixSum));
+//        sampleMultinomialWithoutReplacement
+//          <<>>(
+//            gen->state.gen_states,
+//            n_sample,
+//            sample,
+//            THCudaLongTensor_data(state, self),
+//            numDist, numCategories,
+//            THCTensor_(data)(state, origDist),
+//            THCTensor_(data)(state, prefixSum));
       }
     }
@@ -397,8 +397,8 @@ THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_bernoulli<<>>(
-      gen->state.gen_states, size, data, p);
+//  generate_bernoulli<<>>(
+//      gen->state.gen_states, size, data, p);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -428,8 +428,8 @@ THC_API void THCTensor_(NAME)(THCState* state,                                \
                                                                               \
   THArgCheck(size == prob_size, 3, "inconsistent tensor size");               \
                                                                               \
-  generate_bernoulli_tensor<<>>(                                              \
-      gen->state.gen_states, size, result_data, probs_data);                  \
+/*  generate_bernoulli_tensor<<>>( */                                         \
+/*      gen->state.gen_states, size, result_data, probs_data); */             \
                                                                               \
   PROB_TYPE##_free(state, probs);                                             \
   THCTensor_(freeCopyTo)(state, self, self_);                                 \
@@ -468,8 +468,8 @@ THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p)
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_geometric<<>>(
-      gen->state.gen_states, size, data, p);
+//  generate_geometric<<>>(
+//      gen->state.gen_states, size, data, p);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -489,12 +489,12 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_
 #if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT)
   if (range > 1ULL << 32) {
-    generate_random_64<<>>(
-        gen->state.gen_states, size, data, min_val, range);
+//    generate_random_64<<>>(
+//        gen->state.gen_states, size, data, min_val, range);
   } else {
 #endif
-    generate_random<<>>(
-        gen->state.gen_states, size, data, min_val, range);
+//    generate_random<<>>(
+//        gen->state.gen_states, size, data, min_val, range);
 #if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT)
   }
 #endif
@@ -519,20 +519,20 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
   real *data = THCTensor_(data)(state, self);

 #if defined(THC_REAL_IS_HALF)
-  generate_random<<>>(
-      gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
+//  generate_random<<>>(
+//      gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
 #elif defined(THC_REAL_IS_FLOAT)
-  generate_random<<>>(
-      gen->state.gen_states, size, data, 0UL, (1UL << FLT_MANT_DIG) + 1);
+//  generate_random<<>>(
+//      gen->state.gen_states, size, data, 0UL, (1UL << FLT_MANT_DIG) + 1);
 #elif defined(THC_REAL_IS_DOUBLE)
-  generate_random_64<<>>(
-      gen->state.gen_states, size, data, 0ULL, (1ULL << DBL_MANT_DIG) + 1);
+//  generate_random_64<<>>(
+//      gen->state.gen_states, size, data, 0ULL, (1ULL << DBL_MANT_DIG) + 1);
 #elif defined(THC_REAL_IS_LONG)
-  generate_random_64<<>>(
-      gen->state.gen_states, size, data, 0ULL, static_cast(std::numeric_limits::max()) + 1);
+//  generate_random_64<<>>(
+//      gen->state.gen_states, size, data, 0ULL, static_cast(std::numeric_limits::max()) + 1);
 #else
-  generate_random<<>>(
-      gen->state.gen_states, size, data, 0UL, static_cast(std::numeric_limits::max()) + 1);
+//  generate_random<<>>(
+//      gen->state.gen_states, size, data, 0UL, static_cast(std::numeric_limits::max()) + 1);
 #endif

   THCTensor_(freeCopyTo)(state, self, self_);
diff --git a/aten/src/THCS/THCSparse.cu b/aten/src/THCS/THCSparse.cu
index 90b9bc50edcd7c..fae3835fd587bf 100644
--- a/aten/src/THCS/THCSparse.cu
+++ b/aten/src/THCS/THCSparse.cu
@@ -11,15 +11,17 @@ void THCudaSparse_Xcoo2csr(THCState *state, const int *coorowind, int64_t nnz, i
     ));
 }

-cusparseOperation_t convertTransToCusparseOperation(char trans) {
-  if (trans == 't') return CUSPARSE_OPERATION_TRANSPOSE;
-  else if (trans == 'n') return CUSPARSE_OPERATION_NON_TRANSPOSE;
-  else if (trans == 'c') return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
-  else {
-    THError("trans must be one of: t, n, c");
-    return CUSPARSE_OPERATION_TRANSPOSE;
-  }
-}
+/*
+//cusparseOperation_t convertTransToCusparseOperation(char trans) {
+//  if (trans == 't') return CUSPARSE_OPERATION_TRANSPOSE;
+//  else if (trans == 'n') return CUSPARSE_OPERATION_NON_TRANSPOSE;
+//  else if (trans == 'c') return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
+//  else {
+//    THError("trans must be one of: t, n, c");
+//    return CUSPARSE_OPERATION_TRANSPOSE;
+//  }
+//}
+*/

 void adjustLd(char transb, int64_t m, int64_t n, int64_t k, int64_t *ldb, int64_t *ldc)
 {
diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu
index bf4503515c4342..a84e0d70e4b797 100644
--- a/aten/src/THCUNN/RReLU.cu
+++ b/aten/src/THCUNN/RReLU.cu
@@ -3,54 +3,54 @@
 #include "THCHalfAutoNumerics.cuh"
 #include
 #include "common.h"
-#include
-#include
+//#include
+//#include

 // copied from cutorch/lib/THC/THCTensorRandom.cu
 #define MAX_NUM_BLOCKS 64
 #define BLOCK_SIZE 256
 #define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)

-template
-inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-inline half __device__ curand_uniform_type(curandStateMtgp32 *state) {
-  return ScalarConvert::to(curand_uniform(state));
-}
-#endif
-
-template <>
-inline float __device__ curand_uniform_type(curandStateMtgp32 *state) {
-  return curand_uniform(state);
-}
-
-template <>
-inline double __device__ curand_uniform_type(curandStateMtgp32 *state) {
-  return curand_uniform_double(state);
-}
-
-template
-__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
-  T *input, T* noise, T *output, double a, double b)
-{
-  CUDA_KERNEL_LOOP(i, n)
-  {
-    if (input[i] <= 0)
-    {
-      T r = curand_uniform_type(&state[blockIdx.x]);
-      r = ScalarConvert::to(r * (b-a) + a);
-      output[i] = input[i] * r;
-      noise[i] = r;
-    }
-    else
-    {
-      output[i] = input[i];
-      noise[i] = ScalarConvert::to(1);
-    }
-  }
-}
+//template
+//inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
+//
+//#ifdef CUDA_HALF_TENSOR
+//template <>
+//inline half __device__ curand_uniform_type(curandStateMtgp32 *state) {
+//  return ScalarConvert::to(curand_uniform(state));
+//}
+//#endif
+//
+//template <>
+//inline float __device__ curand_uniform_type(curandStateMtgp32 *state) {
+//  return curand_uniform(state);
+//}
+//
+//template <>
+//inline double __device__ curand_uniform_type(curandStateMtgp32 *state) {
+//  return curand_uniform_double(state);
+//}
+//
+//template
+//__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
+//  T *input, T* noise, T *output, double a, double b)
+//{
+//  CUDA_KERNEL_LOOP(i, n)
+//  {
+//    if (input[i] <= 0)
+//    {
+//      T r = curand_uniform_type(&state[blockIdx.x]);
+//      r = ScalarConvert::to(r * (b-a) + a);
+//      output[i] = input[i] * r;
+//      noise[i] = r;
+//    }
+//    else
+//    {
+//      output[i] = input[i];
+//      noise[i] = ScalarConvert::to(1);
+//    }
+//  }
+//}

 template
 struct RReLUUpdateOutputEval_functor
diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu
index cd9b6590851c1d..90f626a0bf2e0a 100644
--- a/aten/src/THCUNN/SparseLinear.cu
+++ b/aten/src/THCUNN/SparseLinear.cu
@@ -3,17 +3,17 @@
 #include "THCHalfAutoNumerics.cuh"
 #include "THCTensor.hpp"

-#include
+//#include

-static cusparseHandle_t cusparse_handle = 0;
+//static cusparseHandle_t cusparse_handle = 0;

 static void init_cusparse() {
-  if (cusparse_handle == 0) {
-    cusparseStatus_t status = cusparseCreate(&cusparse_handle);
-    if (status != CUSPARSE_STATUS_SUCCESS) {
-      THError("CUSPARSE Library initialization failed");
-    }
-  }
+//  if (cusparse_handle == 0) {
+//    cusparseStatus_t status = cusparseCreate(&cusparse_handle);
+//    if (status != CUSPARSE_STATUS_SUCCESS) {
+//      THError("CUSPARSE Library initialization failed");
+//    }
+//  }
 }

 #ifdef CUDA_HALF_TENSOR
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 462120c014271c..b755196dc98cd8 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -496,8 +496,8 @@ endif()
 if(USE_ROCM AND NOT BUILD_CAFFE2)
   include_directories(SYSTEM ${HIP_PATH}/include)
   include_directories(SYSTEM ${HIPBLAS_PATH}/include)
-  include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
-  include_directories(SYSTEM ${HIPRNG_PATH}/include)
+#  include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
+#  include_directories(SYSTEM ${HIPRNG_PATH}/include)
   include_directories(SYSTEM ${THRUST_PATH})

   # load HIP cmake module and load platform id
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 559b2e23855c95..80b751b03021db 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -46,18 +46,18 @@
 ELSE()
 ENDIF()

 # HIPRNG_PATH
-IF(NOT DEFINED ENV{HIPRNG_PATH})
-  SET(HIPRNG_PATH ${ROCM_PATH}/hcrng)
-ELSE()
-  SET(HIPRNG_PATH $ENV{HIPRNG_PATH})
-ENDIF()
+#IF(NOT DEFINED ENV{HIPRNG_PATH})
+#  SET(HIPRNG_PATH ${ROCM_PATH}/hcrng)
+#ELSE()
+#  SET(HIPRNG_PATH $ENV{HIPRNG_PATH})
+#ENDIF()

 # HIPSPARSE_PATH
-IF(NOT DEFINED ENV{HIPSPARSE_PATH})
-  SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
-ELSE()
-  SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH})
-ENDIF()
+#IF(NOT DEFINED ENV{HIPSPARSE_PATH})
+#  SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
+#ELSE()
+#  SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH})
+#ENDIF()

 # THRUST_PATH
 IF(DEFINED ENV{THRUST_PATH})
@@ -114,7 +114,7 @@ IF(HIP_FOUND)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
   set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
-  set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
+  #set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)

   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
@@ -132,9 +132,9 @@ IF(HIP_FOUND)
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
   FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
-  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
+#  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
   FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
-  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
+#  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)

   # Necessary includes for building PyTorch since we include HIP headers that depend on hcc/hsa headers.
diff --git a/setup.py b/setup.py
index ae99e2b94f1c1f..3c4bb7ef7be31b 100644
--- a/setup.py
+++ b/setup.py
@@ -896,13 +896,13 @@ def run(self):
             rocm_include_path = '/opt/rocm/include'
             hcc_include_path = '/opt/rocm/hcc/include'
             hipblas_include_path = '/opt/rocm/hipblas/include'
-            hipsparse_include_path = '/opt/rocm/hcsparse/include'
+            #hipsparse_include_path = '/opt/rocm/hcsparse/include'
             hip_lib_path = '/opt/rocm/hip/lib'
             hcc_lib_path = '/opt/rocm/hcc/lib'
             include_dirs.append(rocm_include_path)
             include_dirs.append(hcc_include_path)
             include_dirs.append(hipblas_include_path)
-            include_dirs.append(hipsparse_include_path)
+            #include_dirs.append(hipsparse_include_path)
             include_dirs.append(tmp_install_path + "/include/THCUNN")
             extra_link_args.append('-L' + hip_lib_path)
             extra_link_args.append('-Wl,-rpath,' + hip_lib_path)
diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py
index 52a04acbe5d4ba..53113900b4dc74 100644
--- a/tools/amd_build/build_pytorch_amd.py
+++ b/tools/amd_build/build_pytorch_amd.py
@@ -50,7 +50,7 @@
     if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
         continue
     # Update contents.
-    with open(source, "r+") as f:
+    with open(source, "r+", encoding="utf-8") as f:
         contents = f.read()
         contents = contents.replace("USE_CUDA", "USE_ROCM")
         contents = contents.replace("CUDA_VERSION", "0")
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index c3d993cfbb35cd..9d121fe50c9f51 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -118,7 +118,7 @@ PyObject * THCPModule_getRNGState(PyObject *_unused)
   using namespace torch::autograd;
   HANDLE_TH_ERRORS
   auto tensor = VariableType::getType(CPU(kByte))->tensor();
-  THCRandom_getRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
+//  THCRandom_getRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
   return THPVariable_Wrap(tensor);
   END_HANDLE_TH_ERRORS
 }
@@ -131,7 +131,7 @@ PyObject * THCPModule_setRNGState(PyObject *_unused, PyObject *obj)
       Py_TYPE(obj)->tp_name);
   }
   auto& tensor = THPVariable_UnpackData(obj);
-  THCRandom_setRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
+//  THCRandom_setRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
 }