diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
index 76c58b7db86bcd..31a23f5f172a1b 100644
--- a/aten/src/ATen/cudnn/Descriptors.h
+++ b/aten/src/ATen/cudnn/Descriptors.h
@@ -10,7 +10,7 @@
 #if CUDNN_VERSION < 7000

-#include
+//#include

 /*
 Note [cuDNN dropout descriptor initialization]
@@ -233,7 +233,8 @@ inline cudnnStatus_t cudnnRestoreDropoutDescriptor(
   if (ret != CUDNN_STATUS_SUCCESS) return ret;
   if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE;
   dropoutDesc->dropout = dropout;
-  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
+//  dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t);
+  dropoutDesc->nstates = (int)stateSizeInBytes;
   dropoutDesc->states = states;
   return CUDNN_STATUS_SUCCESS;
 }
diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
index c591a30a8aa0dd..155b652c10f73c 100644
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@@ -3,9 +3,9 @@
 #include "ATen/cuda/CUDAApplyUtils.cuh"
 #include "ATen/AccumulateType.h"

-#include
-#include
-#include
+//#include
+//#include
+//#include

 #include
 #include
 #include
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 96f648c681ab2f..9760b0fb3a1db2 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -180,7 +180,7 @@ __global__ void renorm_kernel(
     } else if (norm_type == 2) {
       v += x * x;
     } else {
-      v += std::pow(x, norm_type);
+      //v += std::pow(x, norm_type);
     }
   }

@@ -188,7 +188,7 @@
   v = reduceBlock(sdata, blockDim.x, v, Op(), 0);

   if (tid == 0) {
-    sdata[0] = std::pow(v, static_cast(1.0 / norm_type));
+    //sdata[0] = std::pow(v, static_cast(1.0 / norm_type));
   }
   __syncthreads();
diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu
index 31fe612f98c37b..e112ff6d56aa5f 100644
--- a/aten/src/ATen/native/cuda/Gesv.cu
+++ b/aten/src/ATen/native/cuda/Gesv.cu
@@ -46,17 +46,17 @@ void magmaGesvBatched(
       dB_array, lddb, dinfo_array, batch_count, queue);
 }

-static magma_queue_t createMagmaQueue(const Tensor& tensor) {
-  auto& context = tensor.type().get_context();
-  magma_queue_t magma_queue;
-  magma_queue_create_from_cuda(
-    tensor.get_device(),
-    context.getCurrentCUDAStream(),
-    THCState_getCurrentBlasHandle(context.getTHCState()),
-    THCState_getCurrentSparseHandle(context.getTHCState()),
-    &magma_queue);
-  return magma_queue;
-}
+//static magma_queue_t createMagmaQueue(const Tensor& tensor) {
+//  auto& context = tensor.type().get_context();
+//  magma_queue_t magma_queue;
+//  magma_queue_create_from_cuda(
+//    tensor.get_device(),
+//    context.getCurrentCUDAStream(),
+//    THCState_getCurrentBlasHandle(context.getTHCState()),
+//    THCState_getCurrentSparseHandle(context.getTHCState()),
+//    &magma_queue);
+//  return magma_queue;
+//}

 static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
   auto result = static_cast(value);
@@ -116,9 +116,9 @@ AT_ERROR("gesv: MAGMA library not found in "
     ipiv_array[i] = &ipiv_data[i * n];
   }

-  magmaGesvBatched(
-      n, nrhs, A_array, n, ipiv_array, b_array, n,
-      info_array, batch_size, createMagmaQueue(b));
+//  magmaGesvBatched(
+//      n, nrhs, A_array, n, ipiv_array, b_array, n,
+//      info_array, batch_size, createMagmaQueue(b));

   for (int64_t i = 0; i < batch_size; i++) {
     infos[i] = info_array[i];
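Note on the Embedding.cu hunks above: commenting out the std::pow accumulation means renorm_kernel only handles norm_type 1 and 2. A minimal sketch of an alternative, assuming the target hcc/HIP toolchain exposes the C device math overload ::powf (this is not something the patch relies on, and the helper name is hypothetical):

template <typename accscalar_t>
__device__ __forceinline__ accscalar_t pow_accum(accscalar_t v, accscalar_t x,
                                                 accscalar_t norm_type) {
  // ::powf is the device-side math function; std::pow is what the patch had
  // to disable because it does not resolve in this device context.
  return v + static_cast<accscalar_t>(::powf(static_cast<float>(x),
                                             static_cast<float>(norm_type)));
}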
diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu
index d500a47320f333..22ef5f5831833c 100644
--- a/aten/src/ATen/native/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/cuda/SoftMax.cu
@@ -18,7 +18,7 @@ namespace {
 template
 struct LogSoftMaxForwardEpilogue {
   __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
-    : logsum(max_input + std::log(sum)) {}
+    : logsum(max_input /*+ std::log(sum)*/ ) {}

   __device__ __forceinline__ T operator()(T input) const {
     return static_cast(input - logsum);
@@ -33,7 +33,7 @@ struct LogSoftMaxBackwardEpilogue {
     : sum(sum) {}

   __device__ __forceinline__ T operator()(T gradOutput, T output) const {
-    return static_cast(gradOutput - std::exp(static_cast(output)) * sum);
+    return static_cast(gradOutput /*- std::exp(static_cast(output)) * sum */ );
   }

   const AccumT sum;
@@ -46,7 +46,7 @@ struct SoftMaxForwardEpilogue {
     , sum(sum) {}

   __device__ __forceinline__ T operator()(T input) const {
-    return static_cast(std::exp(input - max_input) / sum);
+    return static_cast(0); // std::exp(input - max_input) / sum);
   }

   const AccumT max_input;
@@ -203,9 +203,9 @@ __global__ void cunn_SpatialSoftMaxForward(
       max_input = spatialBlockReduceX(sdata,max_input);

       accscalar_t sum = 0;
-      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-        sum += std::exp(static_cast(input[data_offset + d * dim_stride])
-               - max_input);
+      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {}
+        //sum += std::exp(static_cast(input[data_offset + d * dim_stride])
+        //       - max_input);
       sum = spatialBlockReduceX(sdata, sum);

       Epilogue epilogue(max_input, sum);
@@ -218,9 +218,9 @@
         max_input = Max()(max_input, value);
       }

       accscalar_t sum = 0;
-      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
-        sum += std::exp(static_cast(input[data_offset + d * dim_stride])
-               - max_input);
+      for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) {}
+        //sum += std::exp(static_cast(input[data_offset + d * dim_stride])
+        //       - max_input);

       Epilogue epilogue(max_input, sum);
       for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x)
         output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]);
@@ -284,7 +284,7 @@ template
 struct MaxFloat {
   __device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
-    return ::max(max, (AccumT)v);
+    return /*::max(max,*/ (AccumT)v /*)*/ ;
   }
 };

@@ -303,7 +303,7 @@ struct SumExpFloat
     : max_k(v) {}

   __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
-    return sum + std::exp(v - max_k);
+    return sum; // + std::exp(v - max_k);
   }

   const AccumT max_k;
diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp
index 114b967f7d309f..34aeca1c23d49d 100644
--- a/aten/src/THC/THCGeneral.cpp
+++ b/aten/src/THC/THCGeneral.cpp
@@ -173,12 +173,12 @@ void THCudaShutdown(THCState* state)
       THCublasCheck(cublasDestroy(res->blasHandles[i]));
     }
     /* Free user defined sparse handles */
-    for (int i = 0; i < res->numSparseHandles; ++i) {
-      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
-    }
+//    for (int i = 0; i < res->numSparseHandles; ++i) {
+//      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
+//    }

     free(res->blasHandles);
-    free(res->sparseHandles);
+//    free(res->sparseHandles);
     THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
     THCThreadLocal_free(state->currentStreams[dev]);
   }
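Aside on the cuSPARSE handle teardown just disabled above: the same effect could be had at compile time rather than by commenting code out, so the CUDA build keeps its behavior. A sketch only, assuming the HIP-platform macro is the intended switch (the macro choice is an assumption, not something this patch introduces):

#ifndef __HIP_PLATFORM_HCC__
    /* Free user defined sparse handles (CUDA builds only) */
    for (int i = 0; i < res->numSparseHandles; ++i) {
      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
    }
    free(res->sparseHandles);
#endif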
@@ -354,14 +354,14 @@ void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSpa
   THCudaCheck(cudaGetDevice(&prevDev));
   THCudaCheck(cudaSetDevice(device));

-  size_t size = numSparseHandles * sizeof(cusparseHandle_t);
-  cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
-  for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
-    handles[i] = NULL;
-    THCusparseCheck(cusparseCreate(&handles[i]));
-  }
-  res->sparseHandles = handles;
-  res->numSparseHandles = numSparseHandles;
+//  size_t size = numSparseHandles * sizeof(cusparseHandle_t);
+//  cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
+//  for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
+//    handles[i] = NULL;
+//    THCusparseCheck(cusparseCreate(&handles[i]));
+//  }
+//  res->sparseHandles = handles;
+//  res->numSparseHandles = numSparseHandles;

   THCudaCheck(cudaSetDevice(prevDev));
 }
@@ -419,16 +419,16 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han
   return res->blasHandles[handle - 1];
 }

-cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
-{
-  if (handle <= 0 || handle > state->numUserSparseHandles) {
-    THError("%d is not a valid handle, valid range is: (1, %d)",
-            handle, state->numUserSparseHandles);
-  }
-  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
-  THCState_reserveDeviceSparseHandles(state, device, handle);
-  return res->sparseHandles[handle - 1];
-}
+//cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
+//{
+//  if (handle <= 0 || handle > state->numUserSparseHandles) {
+//    THError("%d is not a valid handle, valid range is: (1, %d)",
+//            handle, state->numUserSparseHandles);
+//  }
+//  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+//  THCState_reserveDeviceSparseHandles(state, device, handle);
+//  return res->sparseHandles[handle - 1];
+//}

 static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
 {
@@ -493,21 +493,21 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
   return NULL;
 }

-cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
-{
-  /* This is called at the point of kernel execution.
-     For some debugging code or improperly instrumented kernels,
-     `state` is null */
-  if (state) {
-    int device;
-    THCudaCheck(cudaGetDevice(&device));
-
-    int handle = THCState_getCurrentSparseHandleIndex(state);
-    return THCState_getDeviceSparseHandle(state, device, handle);
-  }
-  THError("THCState and sparseHandles must be set as there is no default sparseHandle");
-  return NULL;
-}
+//cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
+//{
+//  /* This is called at the point of kernel execution.
+//     For some debugging code or improperly instrumented kernels,
+//     `state` is null */
+//  if (state) {
+//    int device;
+//    THCudaCheck(cudaGetDevice(&device));
+//
+//    int handle = THCState_getCurrentSparseHandleIndex(state);
+//    return THCState_getDeviceSparseHandle(state, device, handle);
+//  }
+//  THError("THCState and sparseHandles must be set as there is no default sparseHandle");
+//  return NULL;
+//}

 int THCState_getCurrentBlasHandleIndex(THCState *state)
 {
@@ -643,54 +643,54 @@
   }
 }

-void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
-{
-  if(status != CUSPARSE_STATUS_SUCCESS)
-  {
-    const char* errmsg = NULL;
-
-    switch(status)
-    {
-      case CUSPARSE_STATUS_NOT_INITIALIZED:
-        errmsg = "library not initialized";
-        break;
-
-      case CUSPARSE_STATUS_ALLOC_FAILED:
-        errmsg = "resource allocation failed";
-        break;
-
-      case CUSPARSE_STATUS_INVALID_VALUE:
-        errmsg = "an invalid numeric value was used as an argument";
-        break;
-
-      case CUSPARSE_STATUS_ARCH_MISMATCH:
-        errmsg = "an absent device architectural feature is required";
-        break;
-
-      case CUSPARSE_STATUS_MAPPING_ERROR:
-        errmsg = "an access to GPU memory space failed";
-        break;
-
-      case CUSPARSE_STATUS_EXECUTION_FAILED:
-        errmsg = "the GPU program failed to execute";
-        break;
-
-      case CUSPARSE_STATUS_INTERNAL_ERROR:
-        errmsg = "an internal operation failed";
-        break;
-
-      case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-        errmsg = "the matrix type is not supported by this function";
-        break;
-
-      default:
-        errmsg = "unknown error";
-        break;
-    }
-
-    _THError(file, line, "cusparse runtime error : %s", errmsg);
-  }
-}
+//void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
+//{
+//  if(status != CUSPARSE_STATUS_SUCCESS)
+//  {
+//    const char* errmsg = NULL;
+//
+//    switch(status)
+//    {
+//      case CUSPARSE_STATUS_NOT_INITIALIZED:
+//        errmsg = "library not initialized";
+//        break;
+//
+//      case CUSPARSE_STATUS_ALLOC_FAILED:
+//        errmsg = "resource allocation failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_INVALID_VALUE:
+//        errmsg = "an invalid numeric value was used as an argument";
+//        break;
+//
+//      case CUSPARSE_STATUS_ARCH_MISMATCH:
+//        errmsg = "an absent device architectural feature is required";
+//        break;
+//
+//      case CUSPARSE_STATUS_MAPPING_ERROR:
+//        errmsg = "an access to GPU memory space failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_EXECUTION_FAILED:
+//        errmsg = "the GPU program failed to execute";
+//        break;
+//
+//      case CUSPARSE_STATUS_INTERNAL_ERROR:
+//        errmsg = "an internal operation failed";
+//        break;
+//
+//      case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+//        errmsg = "the matrix type is not supported by this function";
+//        break;
+//
+//      default:
+//        errmsg = "unknown error";
+//        break;
+//    }
+//
+//    _THError(file, line, "cusparse runtime error : %s", errmsg);
+//  }
+//}

 void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data )
 {
diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in
index 1b4e115a1fab4e..f58cbf56845c4d 100644
--- a/aten/src/THC/THCGeneral.h.in
+++ b/aten/src/THC/THCGeneral.h.in
@@ -12,7 +12,7 @@
 #include "cuda.h"
 #include "cuda_runtime.h"
 #include "cublas_v2.h"
-#include "cusparse.h"
+//#include "cusparse.h"

 #cmakedefine USE_MAGMA

@@ -65,7 +65,7 @@ typedef struct _THCCudaResourcesPerDevice {
   /* cuBLAS handes are lazily initialized */
   cublasHandle_t* blasHandles;
   /* cuSparse handes are lazily initialized */
-  cusparseHandle_t* sparseHandles;
+//  cusparseHandle_t* sparseHandles;
   /* Size of scratch space per each stream on this device available */
   size_t scratchSpacePerStream;
 } THCCudaResourcesPerDevice;

@@ -171,8 +171,8 @@ THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
 THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
 THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);

-THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
-THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
+//THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
+//THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
 THC_API int THCState_getCurrentSparseHandleIndex(THCState *state);
 THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle);

@@ -184,12 +184,12 @@ THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
 #define THCudaCheck(err)  __THCudaCheck(err, __FILE__, __LINE__)
 #define THCudaCheckWarn(err)  __THCudaCheckWarn(err, __FILE__, __LINE__)
 #define THCublasCheck(err)  __THCublasCheck(err,  __FILE__, __LINE__)
-#define THCusparseCheck(err)  __THCusparseCheck(err,  __FILE__, __LINE__)
+//#define THCusparseCheck(err)  __THCusparseCheck(err,  __FILE__, __LINE__)

 THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
 THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
 THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
-THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
+//THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);

 THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
 THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
diff --git a/aten/src/THC/THCGenerator.hpp b/aten/src/THC/THCGenerator.hpp
index ea5d1ba347d0f9..e19fa6055b8d0c 100644
--- a/aten/src/THC/THCGenerator.hpp
+++ b/aten/src/THC/THCGenerator.hpp
@@ -7,8 +7,8 @@
 #include

 typedef struct THCGeneratorState {
-  struct curandStateMtgp32* gen_states;
-  struct mtgp32_kernel_params *kernel_params;
+//  struct curandStateMtgp32* gen_states;
+//  struct mtgp32_kernel_params *kernel_params;
   int initf;
   uint64_t initial_seed;
   std::atomic philox_seed_offset;
diff --git a/aten/src/THC/THCTensorRandom.cpp b/aten/src/THC/THCTensorRandom.cpp
index 703871bd54a7ba..c8cd91e8377cce 100644
--- a/aten/src/THC/THCTensorRandom.cpp
+++ b/aten/src/THC/THCTensorRandom.cpp
@@ -2,7 +2,7 @@
 #include "THCGenerator.hpp"

 #include
-#include
+//#include

 void initializeGenerator(THCState *state, THCGenerator* gen);

@@ -13,16 +13,16 @@ void createGeneratorState(THCGenerator* gen, uint64_t seed);
 void destroyGenerator(THCState *state, THCGenerator* gen)
 {
   std::lock_guard lock(gen->mutex);
-  if (gen->state.gen_states)
-  {
-    THCudaCheck(THCudaFree(state, gen->state.gen_states));
-    gen->state.gen_states = NULL;
-  }
-  if (gen->state.kernel_params)
-  {
-    THCudaCheck(THCudaFree(state, gen->state.kernel_params));
-    gen->state.kernel_params = NULL;
-  }
+//  if (gen->state.gen_states)
+//  {
+//    THCudaCheck(THCudaFree(state, gen->state.gen_states));
+//    gen->state.gen_states = NULL;
+//  }
+//  if (gen->state.kernel_params)
+//  {
+//    THCudaCheck(THCudaFree(state, gen->state.kernel_params));
+//    gen->state.kernel_params = NULL;
+//  }
 }
 static uint64_t createSeed(std::random_device& rd)
@@ -45,8 +45,8 @@ void THCRandom_init(THCState* state, int devices, int current_device)
     rng_state->gen[i].state.initf = 0;
     rng_state->gen[i].state.initial_seed = createSeed(rd);
     rng_state->gen[i].state.philox_seed_offset = 0;
-    rng_state->gen[i].state.gen_states = NULL;
-    rng_state->gen[i].state.kernel_params = NULL;
+//    rng_state->gen[i].state.gen_states = NULL;
+//    rng_state->gen[i].state.kernel_params = NULL;
   }
 }

@@ -87,11 +87,11 @@ THCGenerator* THCRandom_getGenerator(THCState* state)
   return gen;
 }

-struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state)
-{
-  THCGenerator* gen = THCRandom_getGenerator(state);
-  return gen->state.gen_states;
-}
+//struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state)
+//{
+//  THCGenerator* gen = THCRandom_getGenerator(state);
+//  return gen->state.gen_states;
+//}

 /* Random seed */
 uint64_t THCRandom_seed(THCState* state)
diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu
index d05af2dbf5996e..86b211d0266d89 100644
--- a/aten/src/THC/THCTensorRandom.cu
+++ b/aten/src/THC/THCTensorRandom.cu
@@ -8,10 +8,10 @@
 #include "THCGenerator.hpp"

 #include
-#include
-#include
-#include
-#include
+//#include
+//#include
+//#include
+//#include

 #define MAX_NUM_BLOCKS 200
 #define BLOCK_SIZE 256
@@ -22,22 +22,22 @@ THCGenerator* THCRandom_getGenerator(THCState* state);
 /* Sets up generator. Allocates but does not create the generator states. Not thread-safe. */
 __host__ void initializeGenerator(THCState *state, THCGenerator* gen)
 {
-  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.gen_states, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32)));
-  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.kernel_params, sizeof(mtgp32_kernel_params)));
+//  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.gen_states, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32)));
+//  THCudaCheck(THCudaMalloc(state, (void**)&gen->state.kernel_params, sizeof(mtgp32_kernel_params)));
 }

 /* Creates a new generator state given the seed. Not thread-safe.
 */
 __host__ void createGeneratorState(THCGenerator* gen, uint64_t seed)
 {
-  if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->state.kernel_params) != CURAND_STATUS_SUCCESS)
-  {
-    THError("Creating MTGP constants failed.");
-  }
-  if (curandMakeMTGP32KernelState(gen->state.gen_states, mtgp32dc_params_fast_11213,
-                                  gen->state.kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS)
-  {
-    THError("Creating MTGP kernel state failed.");
-  }
+//  if (curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, gen->state.kernel_params) != CURAND_STATUS_SUCCESS)
+//  {
+//    THError("Creating MTGP constants failed.");
+//  }
+//  if (curandMakeMTGP32KernelState(gen->state.gen_states, mtgp32dc_params_fast_11213,
+//                                  gen->state.kernel_params, MAX_NUM_BLOCKS, seed) != CURAND_STATUS_SUCCESS)
+//  {
+//    THError("Creating MTGP kernel state failed.");
+//  }
 }

 __host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state)
@@ -46,30 +46,32 @@ __host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state)
   std::lock_guard lock(gen->mutex);

   // The RNG state comprises the MTPG32 states, the seed, and an offset used for Philox
-  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+//  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  static const size_t states_size = MAX_NUM_BLOCKS;
   static const size_t seed_size = sizeof(gen->state.initial_seed);
   static const size_t offset_size = sizeof(gen->state.philox_seed_offset);
   static const size_t total_size = states_size + seed_size + offset_size;
   THByteTensor_resize1d(rng_state, total_size);
   THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size");
   THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous");
-  THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->state.gen_states,
-                         states_size, cudaMemcpyDeviceToHost));
+//  THCudaCheck(cudaMemcpy(THByteTensor_data(rng_state), gen->state.gen_states,
+//                         states_size, cudaMemcpyDeviceToHost));
   memcpy(THByteTensor_data(rng_state) + states_size, &gen->state.initial_seed, seed_size);
   memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &gen->state.philox_seed_offset, offset_size);
 }

-__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel)
-{
-  state[threadIdx.x].k = kernel;
-}
+//__global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel)
+//{
+//  state[threadIdx.x].k = kernel;
+//}

 __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state)
 {
   THCGenerator* gen = THCRandom_getGenerator(state);
   std::lock_guard lock(gen->mutex);

-  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+//  static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32);
+  static const size_t states_size = MAX_NUM_BLOCKS;
   static const size_t seed_size = sizeof(gen->state.initial_seed);
   static const size_t offset_size = sizeof(gen->state.philox_seed_offset);
   static const size_t total_size = states_size + seed_size + offset_size;
@@ -82,10 +84,10 @@ __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state)
   }
   THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous");

-  THCudaCheck(cudaMemcpy(gen->state.gen_states, THByteTensor_data(rng_state),
-                         states_size, cudaMemcpyHostToDevice));
-  set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>(
-    gen->state.gen_states, gen->state.kernel_params);
+//  THCudaCheck(cudaMemcpy(gen->state.gen_states, THByteTensor_data(rng_state),
+//                         states_size, cudaMemcpyHostToDevice));
+//  set_rngstate_kernel<<<1, MAX_NUM_BLOCKS, 0, THCState_getCurrentStream(state)>>>(
+//    gen->state.gen_states, gen->state.kernel_params);
   memcpy(&gen->state.initial_seed, THByteTensor_data(rng_state) + states_size, seed_size);
   if (!no_philox_seed) {
     memcpy(&gen->state.philox_seed_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size);
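Two consequences of the RNG hunks above are worth keeping in mind: the serialized RNG-state blob shrinks (states_size is now MAX_NUM_BLOCKS bytes instead of MAX_NUM_BLOCKS * sizeof(curandStateMtgp32), so tensors saved by the two builds are not interchangeable), and the MTGP32 state is never actually created. If the port later needs working device RNG, one possible direction is cuRAND/hipRAND's Philox device API, which needs no precomputed state array. A minimal sketch only, not what the patch does; everything except the curand_* calls is illustrative:

__global__ void philox_uniform_kernel(uint64_t seed, uint64_t offset,
                                      int size, float* result) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= size) return;
  curandStatePhilox4_32_10_t state;
  // Each thread owns a subsequence; `offset` would advance across launches
  // the same way gen->state.philox_seed_offset already does.
  curand_init(seed, idx, offset, &state);
  result[idx] = curand_uniform(&state);
}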
@@ -121,7 +123,8 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1)  \
   int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;                         \
   int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;            \
   for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {  \
-    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]);                          \
+/*    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); */                     \
+    CURAND_T x = (CURAND_T) 0.0;                                           \
     if (i < size) {                                                        \
       T y = TRANSFORM;                                                     \
       result[i] = y;                                                       \
@@ -135,7 +138,8 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \
   int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;                         \
   int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;            \
   for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {  \
-    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]);                          \
+/*    CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); */                     \
+    CURAND_T x = (CURAND_T) 0.0;                                           \
     if (i < size) {                                                        \
       T y = TRANSFORM;                                                     \
       result[i] = y;                                                       \
@@ -149,24 +153,24 @@
 struct is_same { static const bool value = false; };

 template
 struct is_same { static const bool value = true; };

-template
-__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
-        real *result, prob_type *probs)
-{
-  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
-  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
-    if (is_same::value) {
-      double x = curand_uniform_double(&state[blockIdx.x]);
-      if (i < size)
-        result[i] = ScalarConvert::to(x <= probs[i]);
-    } else {
-      float x = curand_uniform(&state[blockIdx.x]);
-      if (i < size)
-        result[i] = ScalarConvert::to(x <= probs[i]);
-    }
-  }
-}
+//template
+//__global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size,
+//        real *result, prob_type *probs)
+//{
+//  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+//  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+//  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+//    if (is_same::value) {
+//      double x = curand_uniform_double(&state[blockIdx.x]);
+//      if (i < size)
+//        result[i] = ScalarConvert::to(x <= probs[i]);
+//    } else {
+//      float x = curand_uniform(&state[blockIdx.x]);
+//      if (i < size)
+//        result[i] = ScalarConvert::to(x <= probs[i]);
+//    }
+//  }
+//}

 // NOTE: curand_uniform is (0, 1] and we want [a, b)
 GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a)
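With CURAND_FUNC stubbed to a constant in the GENERATE_KERNEL macros above, every distribution produced by this file degenerates to a deterministic value. While the curand path is unavailable, one interim option is a Thrust-based fill driven by the element index; sketch only, with the seed plumbing treated as an assumption and only the Thrust calls being real API:

#include <thrust/device_ptr.h>
#include <thrust/random.h>
#include <thrust/tabulate.h>

struct UniformByIndex {
  uint64_t seed; float a, b;
  __host__ __device__ float operator()(int64_t i) const {
    thrust::default_random_engine eng(seed);
    eng.discard(i);  // decorrelate elements by their index
    thrust::uniform_real_distribution<float> dist(a, b);
    return dist(eng);
  }
};

// Fill `data` (a device pointer of length `size`) with uniform samples in [a, b).
void uniform_fill_thrust(float* data, int64_t size, uint64_t seed, float a, float b) {
  thrust::device_ptr<float> ptr(data);
  thrust::tabulate(ptr, ptr + size, UniformByIndex{seed, a, b});
}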
diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh
index 7749f231c5c771..25ea3f340e8db2 100644
--- a/aten/src/THC/THCTensorRandom.cuh
+++ b/aten/src/THC/THCTensorRandom.cuh
@@ -5,37 +5,37 @@
 #include "THCReduceApplyUtils.cuh"
 #include "THCTensorMathReduce.cuh"
-#include
+//#include

 #define MAX_NUM_BLOCKS 200
 #define BLOCK_SIZE 256

 /* Separate kernel because curand_log_normal gets extra parameters.
  */
-template
-__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev)
-{
-  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
-  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
-    float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
-    if (i < size) {
-      result[i] = ScalarConvert::to(x);
-    }
-  }
-}
-
-template <>
-__global__ void generateLogNormal(curandStateMtgp32 *state, int size, double *result, double mean, double stddev)
-{
-  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
-  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
-    double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev);
-    if (i < size) {
-      result[i] = x;
-    }
-  }
-}
+//template
+//__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev)
+//{
+//  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+//  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+//  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+//    float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
+//    if (i < size) {
+//      result[i] = ScalarConvert::to(x);
+//    }
+//  }
+//}
+//
+//template <>
+//__global__ void generateLogNormal(curandStateMtgp32 *state, int size, double *result, double mean, double stddev)
+//{
+//  int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+//  int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+//  for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+//    double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev);
+//    if (i < size) {
+//      result[i] = x;
+//    }
+//  }
+//}

 template
 __global__ void
@@ -289,89 +289,89 @@ sampleMultinomialOnce(int64_t* dest,
   }
 }

-template
-__global__ void
-sampleMultinomialWithReplacement(curandStateMtgp32* state,
-                                 int totalSamples,
-                                 int64_t* dest,
-                                 int64_t distributions,
-                                 int categories,
-                                 T* normDistPrefixSum) {
-  // At the moment, each warp computes one sample value in the binary
-  // search due to divergence. It seems possible to compute multiple
-  // values and limit divergence though later on. However, no matter
-  // what, all block threads must participate in the curand_uniform
-  // call to update the generator state.
-
-  // The block determines the distribution for which we generate a point
-  for (int64_t curDist = blockIdx.x;
-       curDist < distributions;
-       curDist += gridDim.x) {
-    for (int sampleBase = 0;
-         sampleBase < totalSamples; sampleBase += blockDim.y) {
-      // The warp determines the sample
-      int sample = sampleBase + threadIdx.y;
-
-      // All threads participate in this
-      T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
-
-      if (threadIdx.x == 0 && sample < totalSamples) {
-        // Find the bucket that a uniform sample lies in
-        int choice = binarySearchForMultinomial(
-          normDistPrefixSum + curDist * categories,
-          categories,
-          r);
-
-        // Torch indices are 1-based
-        dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
-      }
-    }
-  }
-}
-
-template
-__global__ void
-sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
-                                    int totalSamples,
-                                    int sample,
-                                    int64_t* dest,
-                                    int64_t distributions,
-                                    int categories,
-                                    T* origDist,
-                                    T* normDistPrefixSum) {
-  // At the moment, each warp computes one sample value in the binary
-  // search due to divergence. It seems possible to compute multiple
-  // values and limit divergence though later on. However, no matter
-  // what, all block threads must participate in the curand_uniform
-  // call to update the generator state.
-
-  // The block and warp determines the distribution for which we
-  // generate a point
-  for (int64_t curDistBase = blockIdx.x * blockDim.y;
-       curDistBase < distributions;
-       curDistBase += gridDim.x * blockDim.y) {
-    // The warp determines the distribution
-    int64_t curDist = curDistBase + threadIdx.y;
-
-    // All threads must participate in this
-    T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
-
-    if (threadIdx.x == 0 && curDist < distributions) {
-      // Find the bucket that a uniform sample lies in
-      int choice = binarySearchForMultinomial(
-        normDistPrefixSum + curDist * categories,
-        categories,
-        r);
-
-      // Torch indices are 1-based
-      dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
-
-      // Without replacement, so update the original probability so it
-      // is not considered a second time
-      origDist[curDist * categories + choice] = ScalarConvert::to(0);
-    }
-  }
-}
+//template
+//__global__ void
+//sampleMultinomialWithReplacement(curandStateMtgp32* state,
+//                                 int totalSamples,
+//                                 int64_t* dest,
+//                                 int64_t distributions,
+//                                 int categories,
+//                                 T* normDistPrefixSum) {
+//  // At the moment, each warp computes one sample value in the binary
+//  // search due to divergence. It seems possible to compute multiple
+//  // values and limit divergence though later on. However, no matter
+//  // what, all block threads must participate in the curand_uniform
+//  // call to update the generator state.
+//
+//  // The block determines the distribution for which we generate a point
+//  for (int64_t curDist = blockIdx.x;
+//       curDist < distributions;
+//       curDist += gridDim.x) {
+//    for (int sampleBase = 0;
+//         sampleBase < totalSamples; sampleBase += blockDim.y) {
+//      // The warp determines the sample
+//      int sample = sampleBase + threadIdx.y;
+//
+//      // All threads participate in this
+//      T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
+//
+//      if (threadIdx.x == 0 && sample < totalSamples) {
+//        // Find the bucket that a uniform sample lies in
+//        int choice = binarySearchForMultinomial(
+//          normDistPrefixSum + curDist * categories,
+//          categories,
+//          r);
+//
+//        // Torch indices are 1-based
+//        dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
+//      }
+//    }
+//  }
+//}
+//
+//template
+//__global__ void
+//sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
+//                                    int totalSamples,
+//                                    int sample,
+//                                    int64_t* dest,
+//                                    int64_t distributions,
+//                                    int categories,
+//                                    T* origDist,
+//                                    T* normDistPrefixSum) {
+//  // At the moment, each warp computes one sample value in the binary
+//  // search due to divergence. It seems possible to compute multiple
+//  // values and limit divergence though later on. However, no matter
+//  // what, all block threads must participate in the curand_uniform
+//  // call to update the generator state.
+//
+//  // The block and warp determines the distribution for which we
+//  // generate a point
+//  for (int64_t curDistBase = blockIdx.x * blockDim.y;
+//       curDistBase < distributions;
+//       curDistBase += gridDim.x * blockDim.y) {
+//    // The warp determines the distribution
+//    int64_t curDist = curDistBase + threadIdx.y;
+//
+//    // All threads must participate in this
+//    T r = ScalarConvert::to(curand_uniform(&state[blockIdx.x]));
+//
+//    if (threadIdx.x == 0 && curDist < distributions) {
+//      // Find the bucket that a uniform sample lies in
+//      int choice = binarySearchForMultinomial(
+//        normDistPrefixSum + curDist * categories,
+//        categories,
+//        r);
+//
+//      // Torch indices are 1-based
+//      dest[curDist * totalSamples + sample] = choice + TH_INDEX_BASE;
+//
+//      // Without replacement, so update the original probability so it
+//      // is not considered a second time
+//      origDist[curDist * categories + choice] = ScalarConvert::to(0);
+//    }
+//  }
+//}

 template
 __global__ void
diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu
index ef6ae9191585c8..c2d7d35df59f4d 100644
--- a/aten/src/THC/generic/THCTensorRandom.cu
+++ b/aten/src/THC/generic/THCTensorRandom.cu
@@ -15,8 +15,8 @@ THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, do
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_uniform<<>>(
-      gen->state.gen_states, size, data, a, b);
+//  generate_uniform<<>>(
+//      gen->state.gen_states, size, data, a, b);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -30,8 +30,8 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean,
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_normal<<>>(
-      gen->state.gen_states, size, data, mean, stdv);
+//  generate_normal<<>>(
+//      gen->state.gen_states, size, data, mean, stdv);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -69,8 +69,8 @@ THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generateLogNormal<<>>(
-      gen->state.gen_states, size, data, mean, stdv);
+//  generateLogNormal<<>>(
+//      gen->state.gen_states, size, data, mean, stdv);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -85,8 +85,8 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_exponential<<>>(
-      gen->state.gen_states, size, data, lambda);
+//  generate_exponential<<>>(
+//      gen->state.gen_states, size, data, lambda);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -101,8 +101,8 @@ THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_cauchy<<>>(
-      gen->state.gen_states, size, data, median, sigma);
+//  generate_cauchy<<>>(
+//      gen->state.gen_states, size, data, median, sigma);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
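Since the multinomial device kernels are disabled below, it may help to recall that sampling with replacement reduces to a binary search over the normalized prefix sum, which can also be done on the host once prefixSum has been copied back. The following is a self-contained sketch for orientation only, not THC code:

#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Draw n_sample category indices (0-based) from one distribution given its
// inclusive prefix sum of probabilities.
std::vector<int64_t> multinomial_host(const std::vector<double>& prefix_sum,
                                      int n_sample, uint64_t seed) {
  std::mt19937_64 eng(seed);
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  std::vector<int64_t> out(n_sample);
  for (int s = 0; s < n_sample; ++s) {
    double r = dist(eng) * prefix_sum.back();  // prefix_sum need not sum to 1
    auto it = std::upper_bound(prefix_sum.begin(), prefix_sum.end(), r);
    out[s] = it - prefix_sum.begin();
  }
  return out;
}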
@@ -240,13 +240,13 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
       // distribution concurrently.
       dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS);

-      sampleMultinomialWithReplacement
-        <<>>(
-          gen->state.gen_states,
-          n_sample,
-          THCudaLongTensor_data(state, self),
-          numDist, numCategories,
-          THCTensor_(data)(state, prefixSum));
+//      sampleMultinomialWithReplacement
+//        <<>>(
+//          gen->state.gen_states,
+//          n_sample,
+//          THCudaLongTensor_data(state, self),
+//          numDist, numCategories,
+//          THCTensor_(data)(state, prefixSum));
     } else {
       // Sample without replacement
@@ -273,15 +273,15 @@ THC_API void THCTensor_(multinomial)(struct THCState *state,
         // The kernel can only draw one sample before we have to
         // recalculate our distribution
-        sampleMultinomialWithoutReplacement
-          <<>>(
-            gen->state.gen_states,
-            n_sample,
-            sample,
-            THCudaLongTensor_data(state, self),
-            numDist, numCategories,
-            THCTensor_(data)(state, origDist),
-            THCTensor_(data)(state, prefixSum));
+//        sampleMultinomialWithoutReplacement
+//          <<>>(
+//            gen->state.gen_states,
+//            n_sample,
+//            sample,
+//            THCudaLongTensor_data(state, self),
+//            numDist, numCategories,
+//            THCTensor_(data)(state, origDist),
+//            THCTensor_(data)(state, prefixSum));
       }
     }
@@ -397,8 +397,8 @@ THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_bernoulli<<>>(
-      gen->state.gen_states, size, data, p);
+//  generate_bernoulli<<>>(
+//      gen->state.gen_states, size, data, p);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -428,8 +428,8 @@ THC_API void THCTensor_(NAME)(THCState* state,                                \
                                                                               \
   THArgCheck(size == prob_size, 3, "inconsistent tensor size");               \
                                                                               \
-  generate_bernoulli_tensor<<>>(                                              \
-      gen->state.gen_states, size, result_data, probs_data);                  \
+/*  generate_bernoulli_tensor<<>>( */                                         \
+/*      gen->state.gen_states, size, result_data, probs_data); */             \
                                                                               \
   PROB_TYPE##_free(state, probs);                                             \
   THCTensor_(freeCopyTo)(state, self, self_);                                 \
@@ -468,8 +468,8 @@ THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p)
   THCTensor *self = THCTensor_(newContiguous)(state, self_);
   real *data = THCTensor_(data)(state, self);

-  generate_geometric<<>>(
-      gen->state.gen_states, size, data, p);
+//  generate_geometric<<>>(
+//      gen->state.gen_states, size, data, p);

   THCTensor_(freeCopyTo)(state, self, self_);
 };
@@ -489,12 +489,12 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_
 #if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT)
   if (range > 1ULL << 32) {
-    generate_random_64<<>>(
-        gen->state.gen_states, size, data, min_val, range);
+//    generate_random_64<<>>(
+//        gen->state.gen_states, size, data, min_val, range);
   } else {
 #endif
-    generate_random<<>>(
-        gen->state.gen_states, size, data, min_val, range);
+//    generate_random<<>>(
+//        gen->state.gen_states, size, data, min_val, range);
 #if defined(THC_REAL_IS_LONG) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_FLOAT)
   }
 #endif
@@ -519,20 +519,20 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_)
   real *data = THCTensor_(data)(state, self);

 #if defined(THC_REAL_IS_HALF)
-  generate_random<<>>(
-      gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
+//  generate_random<<>>(
+//      gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1);
 #elif defined(THC_REAL_IS_FLOAT)
-  generate_random<<>>(
-      gen->state.gen_states, size, data, 0UL, (1UL << FLT_MANT_DIG) + 1);
+//  generate_random<<>>(
+//      gen->state.gen_states, size, data, 0UL, (1UL << FLT_MANT_DIG) + 1);
 #elif defined(THC_REAL_IS_DOUBLE)
-  generate_random_64<<>>(
-      gen->state.gen_states, size, data, 0ULL, (1ULL << DBL_MANT_DIG) + 1);
+//  generate_random_64<<>>(
+//      gen->state.gen_states, size, data, 0ULL, (1ULL << DBL_MANT_DIG) + 1);
 #elif defined(THC_REAL_IS_LONG)
-  generate_random_64<<>>(
-      gen->state.gen_states, size, data, 0ULL, static_cast(std::numeric_limits::max()) + 1);
+//  generate_random_64<<>>(
+//      gen->state.gen_states, size, data, 0ULL, static_cast(std::numeric_limits::max()) + 1);
 #else
-  generate_random<<>>(
-      gen->state.gen_states, size, data, 0UL, static_cast(std::numeric_limits::max()) + 1);
+//  generate_random<<>>(
+//      gen->state.gen_states, size, data, 0UL, static_cast(std::numeric_limits::max()) + 1);
 #endif

   THCTensor_(freeCopyTo)(state, self, self_);
diff --git a/aten/src/THCS/THCSparse.cu b/aten/src/THCS/THCSparse.cu
index 90b9bc50edcd7c..fae3835fd587bf 100644
--- a/aten/src/THCS/THCSparse.cu
+++ b/aten/src/THCS/THCSparse.cu
@@ -11,15 +11,17 @@ void THCudaSparse_Xcoo2csr(THCState *state, const int *coorowind, int64_t nnz, i
     ));
 }

-cusparseOperation_t convertTransToCusparseOperation(char trans) {
-  if (trans == 't') return CUSPARSE_OPERATION_TRANSPOSE;
-  else if (trans == 'n') return CUSPARSE_OPERATION_NON_TRANSPOSE;
-  else if (trans == 'c') return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
-  else {
-    THError("trans must be one of: t, n, c");
-    return CUSPARSE_OPERATION_TRANSPOSE;
-  }
-}
+/*
+//cusparseOperation_t convertTransToCusparseOperation(char trans) {
+//  if (trans == 't') return CUSPARSE_OPERATION_TRANSPOSE;
+//  else if (trans == 'n') return CUSPARSE_OPERATION_NON_TRANSPOSE;
+//  else if (trans == 'c') return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
+//  else {
+//    THError("trans must be one of: t, n, c");
+//    return CUSPARSE_OPERATION_TRANSPOSE;
+//  }
+//}
+*/

 void adjustLd(char transb, int64_t m, int64_t n, int64_t k, int64_t *ldb, int64_t *ldc)
 {
diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu
index bf4503515c4342..a84e0d70e4b797 100644
--- a/aten/src/THCUNN/RReLU.cu
+++ b/aten/src/THCUNN/RReLU.cu
@@ -3,54 +3,54 @@
 #include "THCHalfAutoNumerics.cuh"
 #include
 #include "common.h"
-#include
-#include
+//#include
+//#include

 // copied from cutorch/lib/THC/THCTensorRandom.cu
 #define MAX_NUM_BLOCKS 64
 #define BLOCK_SIZE 256
 #define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)

-template
-inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-inline half __device__ curand_uniform_type(curandStateMtgp32 *state) {
-  return ScalarConvert::to(curand_uniform(state));
-}
-#endif
-
-template <>
-inline float __device__ curand_uniform_type(curandStateMtgp32 *state) {
-  return curand_uniform(state);
-}
-
-template <>
-inline double __device__ curand_uniform_type(curandStateMtgp32 *state) {
-  return curand_uniform_double(state);
-}
-
-template
-__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
-  T *input, T* noise, T *output, double a, double b)
-{
-  CUDA_KERNEL_LOOP(i, n)
-  {
-    if (input[i] <= 0)
-    {
-      T r = curand_uniform_type(&state[blockIdx.x]);
-      r = ScalarConvert::to(r * (b-a) + a);
-      output[i] = input[i] * r;
-      noise[i] = r;
-    }
-    else
-    {
-      output[i] = input[i];
-      noise[i] = ScalarConvert::to(1);
-    }
-  }
-}
+//template
+//inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
+//
+//#ifdef CUDA_HALF_TENSOR
+//template <>
+//inline half __device__ curand_uniform_type(curandStateMtgp32 *state) {
+//  return ScalarConvert::to(curand_uniform(state));
+//}
+//#endif
+//
+//template <>
+//inline float __device__ curand_uniform_type(curandStateMtgp32 *state) {
+//  return curand_uniform(state);
+//}
+//
+//template <>
+//inline double __device__ curand_uniform_type(curandStateMtgp32 *state) {
+//  return curand_uniform_double(state);
+//}
+//
+//template
+//__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
+//  T *input, T* noise, T *output, double a, double b)
+//{
+//  CUDA_KERNEL_LOOP(i, n)
+//  {
+//    if (input[i] <= 0)
+//    {
+//      T r = curand_uniform_type(&state[blockIdx.x]);
+//      r = ScalarConvert::to(r * (b-a) + a);
+//      output[i] = input[i] * r;
+//      noise[i] = r;
+//    }
+//    else
+//    {
+//      output[i] = input[i];
+//      noise[i] = ScalarConvert::to(1);
+//    }
+//  }
+//}

 template
 struct RReLUUpdateOutputEval_functor
diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu
index cd9b6590851c1d..90f626a0bf2e0a 100644
--- a/aten/src/THCUNN/SparseLinear.cu
+++ b/aten/src/THCUNN/SparseLinear.cu
@@ -3,17 +3,17 @@
 #include "THCHalfAutoNumerics.cuh"
 #include "THCTensor.hpp"

-#include
+//#include

-static cusparseHandle_t cusparse_handle = 0;
+//static cusparseHandle_t cusparse_handle = 0;

 static void init_cusparse() {
-  if (cusparse_handle == 0) {
-    cusparseStatus_t status = cusparseCreate(&cusparse_handle);
-    if (status != CUSPARSE_STATUS_SUCCESS) {
-      THError("CUSPARSE Library initialization failed");
-    }
-  }
+//  if (cusparse_handle == 0) {
+//    cusparseStatus_t status = cusparseCreate(&cusparse_handle);
+//    if (status != CUSPARSE_STATUS_SUCCESS) {
+//      THError("CUSPARSE Library initialization failed");
+//    }
+//  }
 }

 #ifdef CUDA_HALF_TENSOR
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 462120c014271c..b755196dc98cd8 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -496,8 +496,8 @@ endif()
 if(USE_ROCM AND NOT BUILD_CAFFE2)
   include_directories(SYSTEM ${HIP_PATH}/include)
   include_directories(SYSTEM ${HIPBLAS_PATH}/include)
-  include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
-  include_directories(SYSTEM ${HIPRNG_PATH}/include)
+#  include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
+#  include_directories(SYSTEM ${HIPRNG_PATH}/include)
   include_directories(SYSTEM ${THRUST_PATH})

   # load HIP cmake module and load platform id
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 559b2e23855c95..80b751b03021db 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -46,18 +46,18 @@
 ELSE()
 ENDIF()

 # HIPRNG_PATH
-IF(NOT DEFINED ENV{HIPRNG_PATH})
-  SET(HIPRNG_PATH ${ROCM_PATH}/hcrng)
-ELSE()
-  SET(HIPRNG_PATH $ENV{HIPRNG_PATH})
-ENDIF()
+#IF(NOT DEFINED ENV{HIPRNG_PATH})
+#  SET(HIPRNG_PATH ${ROCM_PATH}/hcrng)
+#ELSE()
+#  SET(HIPRNG_PATH $ENV{HIPRNG_PATH})
+#ENDIF()

 # HIPSPARSE_PATH
-IF(NOT DEFINED ENV{HIPSPARSE_PATH})
-  SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
-ELSE()
-  SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH})
-ENDIF()
+#IF(NOT DEFINED ENV{HIPSPARSE_PATH})
+#  SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
+#ELSE()
+#  SET(HIPSPARSE_PATH $ENV{HIPSPARSE_PATH})
+#ENDIF()

 # THRUST_PATH
 IF(DEFINED ENV{THRUST_PATH})
@@ -114,7 +114,7 @@ IF(HIP_FOUND)
   set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
   set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas)
-  set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
+  #set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)

   find_package(rocrand REQUIRED)
   find_package(hiprand REQUIRED)
@@ -132,9 +132,9 @@ IF(HIP_FOUND)
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
   FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
-  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
+#  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
   FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
-  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
+#  FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)

   # Necessary includes for building PyTorch since we include HIP headers that depend on hcc/hsa headers.
diff --git a/setup.py b/setup.py
index ae99e2b94f1c1f..3c4bb7ef7be31b 100644
--- a/setup.py
+++ b/setup.py
@@ -896,13 +896,13 @@ def run(self):
             rocm_include_path = '/opt/rocm/include'
             hcc_include_path = '/opt/rocm/hcc/include'
             hipblas_include_path = '/opt/rocm/hipblas/include'
-            hipsparse_include_path = '/opt/rocm/hcsparse/include'
+            #hipsparse_include_path = '/opt/rocm/hcsparse/include'
             hip_lib_path = '/opt/rocm/hip/lib'
             hcc_lib_path = '/opt/rocm/hcc/lib'
             include_dirs.append(rocm_include_path)
             include_dirs.append(hcc_include_path)
             include_dirs.append(hipblas_include_path)
-            include_dirs.append(hipsparse_include_path)
+            #include_dirs.append(hipsparse_include_path)
             include_dirs.append(tmp_install_path + "/include/THCUNN")
             extra_link_args.append('-L' + hip_lib_path)
             extra_link_args.append('-Wl,-rpath,' + hip_lib_path)
diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py
index 52a04acbe5d4ba..53113900b4dc74 100644
--- a/tools/amd_build/build_pytorch_amd.py
+++ b/tools/amd_build/build_pytorch_amd.py
@@ -50,7 +50,7 @@
     if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False):
         continue
     # Update contents.
-    with open(source, "r+") as f:
+    with open(source, "r+", encoding="utf-8") as f:
         contents = f.read()
         contents = contents.replace("USE_CUDA", "USE_ROCM")
         contents = contents.replace("CUDA_VERSION", "0")
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index c3d993cfbb35cd..9d121fe50c9f51 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -118,7 +118,7 @@ PyObject * THCPModule_getRNGState(PyObject *_unused)
   using namespace torch::autograd;
   HANDLE_TH_ERRORS
   auto tensor = VariableType::getType(CPU(kByte))->tensor();
-  THCRandom_getRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
+//  THCRandom_getRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
   return THPVariable_Wrap(tensor);
   END_HANDLE_TH_ERRORS
 }
@@ -131,7 +131,7 @@ PyObject * THCPModule_setRNGState(PyObject *_unused, PyObject *obj)
       Py_TYPE(obj)->tp_name);
   }
   auto& tensor = THPVariable_UnpackData(obj);
-  THCRandom_setRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
+//  THCRandom_setRNGState(state, (THByteTensor*)tensor.unsafeGetTH(false));
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
 }