Commit 0b331fd
[CUDA] Abate SoftMax.cu compiler warning spam (pytorch#128468)
Avoids excessively spammy warnings such as
```
pytorch/aten/src/ATen/native/cuda/SoftMax.cu(844): warning #191-D: type qualifier is meaningless on cast type
[&] { const auto& the_type = input.scalar_type(); constexpr const char* at_dispatch_name = "host_softmax"; at::ScalarType _st = ::detail::scalar_type(the_type); ; switch (_st) { case at::ScalarType::Double: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Double)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Double), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Double>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = 
(at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::Float: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Float)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Float), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Float>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto 
output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::Half: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::Half)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? 
If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::Half), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::Half>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, 
smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } case at::ScalarType::BFloat16: { do { if constexpr (!at::should_include_kernel_dtype( at_dispatch_name, at::ScalarType::BFloat16)) { do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str("dtype '", toString(at::ScalarType::BFloat16), "' not selected for kernel tag ", at_dispatch_name)))); }; } while (false); } } while (0); using scalar_t __attribute__((__unused__)) = c10::impl::ScalarTypeToCPPTypeT<at::ScalarType::BFloat16>; return [&] { using accscalar_t = acc_type<scalar_t, true>; if (!half_to_float) { auto output_ptr = output.mutable_data_ptr<scalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(880), true); } while (0); } } else { auto output_ptr = output.mutable_data_ptr<accscalar_t>(); auto input_ptr = input.const_data_ptr<scalar_t>(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { dispatch_softmax_forward<scalar_t, accscalar_t, accscalar_t, is_log_softmax, false>( output_ptr, input_ptr, dim_size, dim_size, std::min<int64_t>(remaining, chunk_size), nullptr ); input_ptr += chunk_size * dim_size; output_ptr += chunk_size * dim_size; remaining -= chunk_size; } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); dim3 block = SoftMaxForward_getBlockSize(dim_size); 
size_t smem_reduction_sz = block.x / 32 * sizeof(accscalar_t); auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - smem_reduction_sz) / sizeof(scalar_t); bool can_use_smem = dim_size < max_elements_per_smem; can_use_smem &= !(reinterpret_cast<const uintptr_t>(input_ptr) % ALIGN_BYTES); can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES)); can_use_smem &= !(dim_size % ILP); if (can_use_smem) { size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size); } else { cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue> <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size); } do { const cudaError_t __err = cudaGetLastError(); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", __func__, static_cast<uint32_t>(916), true); } while (0); } } }(); } default: do { ::c10::detail::deprecated_AT_ERROR(); if (!(false)) { ::c10::detail::torchCheckFail( __func__, "/workspace/pytorch/aten/src/ATen/native/cuda/SoftMax.cu", static_cast<uint32_t>(844), (::c10::detail::torchCheckMsgImpl( "Expected " "false" " to be true, but got false. " "(Could this error message be improved? If so, " "please report an enhancement request to PyTorch.)", ::c10::str('"', at_dispatch_name, "\" not implemented for '", toString(_st), "'")))); }; } while (false); } }()
```
and
```
SoftMax.cu:844: warning: comparison of integer expressions of different signedness: ‘int64_t’ {aka ‘long int’} and ‘long unsigned int’ [-Wsign-compare]
```
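The rendered diff further down did not survive extraction, so as orientation only: both warnings come from the `can_use_smem` checks quoted above, and the usual way to silence them is to drop the meaningless `const` from the integral cast and to make the signedness of the comparison explicit. Below is a minimal sketch of those two patterns and their conventional fixes; the pointer value, sizes, and `ALIGN_BYTES` value are illustrative placeholders, not taken from the actual patch.
```cpp
// Sketch only: the two warning-producing patterns from the log above and the
// conventional one-line fixes. All values are illustrative placeholders.
#include <cstdint>
#include <cstdio>

constexpr int ALIGN_BYTES = 16;  // alignment assumed for vectorized access

int main() {
  const float* input_ptr = nullptr;     // stands in for input.const_data_ptr<scalar_t>()
  int64_t dim_size = 2048;              // signed softmax dimension
  size_t max_elements_per_smem = 4096;  // unsigned, derived from sharedMemPerBlock

  // Warning #191-D: `reinterpret_cast<const uintptr_t>(input_ptr)` -- a
  // `const` on an integral cast target is meaningless. Dropping the qualifier
  // yields the same value and silences the warning.
  bool aligned = !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);

  // -Wsign-compare: comparing int64_t against an unsigned quantity. Casting
  // the signed operand makes both sides the same signedness.
  bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;

  std::printf("aligned=%d can_use_smem=%d\n", aligned, can_use_smem);
  return 0;
}
```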
Pull Request resolved: pytorch#128468
Approved by: https://github.com/valentinandrei
1 file changed: aten/src/ATen/native/cuda/SoftMax.cu (4 additions, 4 deletions)
[Diff not captured in this extraction: two hunks in aten/src/ATen/native/cuda/SoftMax.cu, each replacing two lines, around lines 866-867 and 902-903.]