diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu
index 1fd151c1402495..006dfd11b52295 100644
--- a/aten/src/ATen/native/cuda/Shape.cu
+++ b/aten/src/ATen/native/cuda/Shape.cu
@@ -14,11 +14,7 @@ namespace at {
 namespace native {
 
-#ifdef __HIP_PLATFORM_HCC__
-constexpr int CAT_ARRAY_BATCH_SIZE = 1024;
-#else
 constexpr int CAT_ARRAY_BATCH_SIZE = 128;
-#endif
 constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4;
 
 namespace {
 
@@ -85,45 +81,6 @@ struct TensorSizeStride {
  */
 
-// Use pinned memory and and pass the struct by pointer on ROCm
-template <typename T, typename IndexType>
-struct CatArrInputTensor {
-  T* input;
-  IndexType offset;
-  IndexType dimSize;
-  IndexType nElements;
-};
-
-template <typename T, typename IndexType, int Dims>
-C10_LAUNCH_BOUNDS_1(512)
-__global__ void HIP_CatArrayBatchedCopy(
-    T* output,
-    CatArrInputTensor<T, IndexType>* inputs,
-    TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
-    const int concatDim,
-    IndexType dimStride) {
-
-    IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
-    IndexType nElements = inputs[blockIdx.y].nElements;
-
-    if(tid >= nElements) return;
-
-    T* data = inputs[blockIdx.y].input;
-    IndexType offset = inputs[blockIdx.y].offset;
-    IndexType dimSize = inputs[blockIdx.y].dimSize;
-    IndexType dataOffset = offset * dimStride;
-
-    IndexType stride = gridDim.x * blockDim.x;
-
-    while( tid < nElements){
-      IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
-              os.tensorSize, os.tensorStride, dimSize, concatDim, tid);
-      output[dataOffset + elementOffset] = data[tid];
-
-      tid += stride;
-    }
-}
-
 // pass meta data directly through kernel argument instead of pin memory
 // In contiguous case, we will not need stride_size, setting it as 1 as placeholder
 // to pass compile.
@@ -173,127 +130,6 @@ __global__ void CatArrayBatchedCopy(
   }
 }
 
-template <typename scalar_t>
-void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension,
-                      int nDims, c10::MemoryFormat memory_format) {
-  // First, let's set up our kernel parameters. We start with a raw pointer to
-  // the storage for the output Tensor.
-  scalar_t *data = out.data_ptr<scalar_t>();
-
-  // Kernel Parameter
-  long tensorMetadataSize =
-    sizeof(CatArrInputTensor<scalar_t, unsigned int>) * CAT_ARRAY_BATCH_SIZE;
-  auto d_inputs_storage = at::empty(
-    {tensorMetadataSize}, out.options().dtype(at::kByte));
-  auto d_inputs = static_cast<CatArrInputTensor<scalar_t, unsigned int> *>(
-    d_inputs_storage.data_ptr());
-
-  TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
-
-  // Next, let's initialize the size, stride arrays for the output Tensor.
-  if (memory_format == c10::MemoryFormat::Contiguous) {
-    for (int i = 0; i < nDims; ++i) {
-      outputParam.tensorSize[i] = at::native::size(out, i);
-      outputParam.tensorStride[i] = out.stride(i);
-    }
-  } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
-    // permute the semantics of dims from NCHW to NHWC so that the input
-    // tensor is now contiguous
-    outputParam.tensorSize[0] = at::native::size(out, 0);
-    outputParam.tensorStride[0] = out.stride(0);
-    for (int i = 1; i < nDims - 1; ++i) {
-      outputParam.tensorSize[i] = at::native::size(out, i + 1);
-      outputParam.tensorStride[i] = out.stride(i + 1);
-    }
-    outputParam.tensorSize[nDims - 1] = at::native::size(out, 1);
-    outputParam.tensorStride[nDims - 1] = out.stride(1);
-  } else {
-    TORCH_CHECK(false, "unsupported memory format");
-  }
-
-  at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
-
-  // Now we loop
-  int batchCounter = 0;
-  int64_t offset = 0;
-  for (int i = 0; i < inputs.size() ; i += CAT_ARRAY_BATCH_SIZE) {
-    // Re-allocate stackInputs every iteration to avoid read-after-write hazard
-    {
-      auto stackInputs_storage = at::empty({tensorMetadataSize},
-          out.options().dtype(at::kByte).device(at::kCPU).pinned_memory(true));
-      auto stackInputs =
-        static_cast<CatArrInputTensor<scalar_t, unsigned int> *>(
-          stackInputs_storage.data_ptr());
-      for (batchCounter = 0;
-           batchCounter < CAT_ARRAY_BATCH_SIZE &&
-             (i+batchCounter) < inputs.size();
-           ++batchCounter) {
-        int64_t dimSize = 0;
-        // There is a legacy case where a 1-D empty tensor can be concat with
-        // high-dimensional tensor
-        if (inputs[i+batchCounter].numel() > 0) {
-          dimSize = at::native::size(inputs[i+batchCounter], dimension);
-        }
-
-        stackInputs[batchCounter].input =
-          inputs[i+batchCounter].data_ptr<scalar_t>();
-        stackInputs[batchCounter].offset = offset;
-        stackInputs[batchCounter].dimSize = dimSize;
-        stackInputs[batchCounter].nElements = inputs[i+batchCounter].numel();
-
-        // update offset
-        offset += dimSize;
-      }
-      at::native::copy_(d_inputs_storage, stackInputs_storage,
-                        /* non_blocking= */ true);
-    }
-
-    // Next, let's consider how we set our kernel launch parameters.
-    // We borrow from THCApply, which the kernel's internal indexing
-    // is based on.
-    dim3 applyBlock = dim3(32*16);
-
-    //Get grid where x dim fills half gpu and y dim is number of tensors.
-    //This will have cating two tensors fill the entire grid, but prevent
-    //many threads from needlessly load meta data if their sizes is small.
-    dim3 catGrid;
-    getCatGrid(batchCounter, catGrid);
-
-    if (memory_format != c10::MemoryFormat::Contiguous) {
-      switch (dimension) {
-      case 0:
-        break;
-      case 1:
-        dimension = nDims - dimension;
-        break;
-      default:
-        dimension--;
-      }
-    }
-    // Template Declarations for dim = 1, 2, 3, 4
-#define HANDLE_CASE(DIMS) \
-    HIP_CatArrayBatchedCopy<scalar_t, unsigned int, DIMS><<<\
-        catGrid, applyBlock, 0, stream.stream()>>>(\
-            data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-    switch (nDims) {
-      case 1:
-        HANDLE_CASE(1);
-        break;
-      case 2:
-        HANDLE_CASE(2);
-        break;
-      case 3:
-        HANDLE_CASE(3);
-        break;
-      case 4:
-        HANDLE_CASE(4);
-        break;
-    }
-#undef HANDLE_CASE
-  }
-}
-
 template <typename scalar_t, int batch_size, int stride_size>
 void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension,
                   int nDims, c10::MemoryFormat memory_format) {
@@ -546,19 +382,6 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) {
   });
   allSameType = allSameType && (out.scalar_type() == firstType);
 
-#ifdef __HIP_PLATFORM_HCC__
-  if (inputs.size() > 1 &&
-      out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
-      at::cuda::detail::canUse32BitIndexMath(out) &&
-      allContiguous &&
-      all32BitIndexable &&
-      allSameType) {
-      AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
-          at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
-          out.scalar_type(), "cat_cuda", [&]() {
-        hip_parallel_cat<scalar_t>(out, inputs, dimension, nDims, memory_format);
-      });
-#else
   // We support the contiguous inputs and non-contiguous input (<=4 dims) in different ways
   // For contiguous input, we don't need to pass stride meta data to cuda kernel through constant
   // memory. Therefore, we could pass more inputs to cuda threads.
@@ -587,7 +410,6 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) {
           out.scalar_type(), "cat_cuda", [&]() {
       parallel_cat<scalar_t, CAT_ARRAY_BATCH_SIZE/2, CAT_ARRAY_MAX_INPUT_DIMS>(out, inputs, dimension, nDims, memory_format);
     });
-#endif
   } else {
     int64_t offset = 0;
     for (int j = 0; j < inputs.size(); j++)
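
For context, the retained CatArrayBatchedCopy path packs per-input metadata into a struct that is passed by value as a kernel argument (with a 2D grid whose y dimension selects the input), instead of staging the metadata in pinned host memory and copying it to the device as the removed hip_parallel_cat did. The standalone CUDA sketch below illustrates only that pattern in isolation; the names (InputMeta, InputMetaBatch, batched_concat_dim0, BATCH) are invented for the example, it handles just contiguous 1-D inputs concatenated along dim 0, and it is not the actual PyTorch kernel.

// Illustrative sketch only: per-input metadata travels in the kernel argument
// buffer, so no pinned staging buffer or extra host->device copy is needed.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int BATCH = 4;          // how many inputs one launch handles

template <typename T>
struct InputMeta {
  const T* data;                  // device pointer to one input
  int offset;                     // element offset of this input in the output
  int nElements;                  // number of elements to copy
};

template <typename T>
struct InputMetaBatch {           // passed by value: lives in the kernel args
  InputMeta<T> inputs[BATCH];
};

// blockIdx.y selects the input within the batch; threads grid-stride over it.
template <typename T>
__global__ void batched_concat_dim0(T* out, InputMetaBatch<T> metas) {
  InputMeta<T> m = metas.inputs[blockIdx.y];
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m.nElements;
       i += gridDim.x * blockDim.x) {
    out[m.offset + i] = m.data[i];   // dim-0 concat of contiguous 1-D inputs
  }
}

int main() {
  // Two small inputs concatenated into one output buffer.
  float h0[3] = {1, 2, 3}, h1[2] = {4, 5}, hOut[5];
  float *d0, *d1, *dOut;
  cudaMalloc(&d0, sizeof(h0)); cudaMalloc(&d1, sizeof(h1));
  cudaMalloc(&dOut, sizeof(hOut));
  cudaMemcpy(d0, h0, sizeof(h0), cudaMemcpyHostToDevice);
  cudaMemcpy(d1, h1, sizeof(h1), cudaMemcpyHostToDevice);

  InputMetaBatch<float> metas{};
  metas.inputs[0] = {d0, 0, 3};
  metas.inputs[1] = {d1, 3, 2};
  // Unused batch slots have nElements == 0 and copy nothing.

  dim3 grid(1, BATCH);            // y dimension: one slot per input
  batched_concat_dim0<<<grid, 128>>>(dOut, metas);
  cudaMemcpy(hOut, dOut, sizeof(hOut), cudaMemcpyDeviceToHost);
  for (float v : hOut) printf("%g ", v);   // expected: 1 2 3 4 5
  printf("\n");
  cudaFree(d0); cudaFree(d1); cudaFree(dOut);
  return 0;
}

Because the whole batch description rides along in the kernel's parameter buffer, the unified path avoids the pinned-memory allocation and the extra copy_ that the removed ROCm-specific code performed per batch.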