diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh index da9b1539d9094c..f00b57df8dd999 100644 --- a/aten/src/THC/THCTensorMathReduce.cuh +++ b/aten/src/THC/THCTensorMathReduce.cuh @@ -121,7 +121,6 @@ __global__ void THCTensor_kernel_renorm(T *data, buffer[tx] = scalar_cast(0); AccT norm; -#if !defined(__HIP_DEVICE_COMPILE__) if (THCNumerics::eq(value, scalar_cast(INFINITY))) { // get norm of axis for (ptrdiff_t i = tx; i < size; i += step) { @@ -168,7 +167,6 @@ __global__ void THCTensor_kernel_renorm(T *data, row[i] = scalar_cast(THCNumerics::mul(val, norm)); } } -#endif } template diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 91319745b6a1de..958bf623b85f05 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -70,6 +70,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t va if (numel > 0) { ptrdiff_t size = numel / THTensor_sizeLegacyNoScalars(data, 0); dim3 grid( THTensor_sizeLegacyNoScalars(data, 0)); + // NOTE: this kernel works on GPUs with a warp size != 32 (such as AMD) only when launched with exactly this number of threads. Do not change it without also changing the buffer size in the kernel. dim3 threads(32); THCTensor_kernel_renorm