From bb39b47e125b1ccee03e0ee9c6e92e40d6688945 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 2 May 2023 08:42:25 -0700 Subject: [PATCH 01/22] Basic cuda support for opportunistic_group, fixed_size_group, and ballot_group. Signed-off-by: JackAKirk --- libclc/ptx-nvidiacl/libspirv/SOURCES | 2 +- .../{group_ballot.cl => group_non_uniform.cl} | 7 +++++ .../ext/oneapi/experimental/ballot_group.hpp | 4 +-- .../oneapi/experimental/fixed_size_group.hpp | 26 +++++++++++++++++++ .../experimental/non_uniform_groups.hpp | 7 ++++- .../experimental/opportunistic_group.hpp | 12 ++++++--- .../NonUniformGroups/ballot_group.cpp | 2 +- .../NonUniformGroups/fixed_size_group.cpp | 2 +- .../NonUniformGroups/opportunistic_group.cpp | 2 +- 9 files changed, 53 insertions(+), 11 deletions(-) rename libclc/ptx-nvidiacl/libspirv/group/{group_ballot.cl => group_non_uniform.cl} (80%) diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES index bec378d428511..4177aae12b416 100644 --- a/libclc/ptx-nvidiacl/libspirv/SOURCES +++ b/libclc/ptx-nvidiacl/libspirv/SOURCES @@ -93,7 +93,7 @@ images/image_helpers.ll images/image.cl group/collectives_helpers.ll group/collectives.cl -group/group_ballot.cl +group/group_non_uniform.cl atomic/atomic_add.cl atomic/atomic_and.cl atomic/atomic_cmpxchg.cl diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl similarity index 80% rename from libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl rename to libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl index 33285028b7b39..50826d9bf53e2 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "membermask.h" +#include #include #include @@ -34,3 +35,9 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) { return res; } + +_CLC_DEF _CLC_CONVERGENT uint +_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j( + uint scope, uint flag, __clc_vec4_uint32_t mask) { + return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]); +} diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index fcdce42652075..f1022db9ebedb 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -121,8 +121,8 @@ template class ballot_group { friend ballot_group get_ballot_group(ParentGroup g, bool predicate); - friend uint32_t sycl::detail::IdToMaskPosition>( - ballot_group Group, uint32_t Id); + friend sub_group_mask sycl::detail::GetMask>( + ballot_group Group); }; template diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index 3c2a1b07b74d7..ea8c9e010ae04 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -112,10 +112,23 @@ template class fixed_size_group { } protected: + +#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + sub_group_mask Mask; +#endif + +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + fixed_size_group(ext::oneapi::sub_group_mask mask):Mask(mask) {} +#else fixed_size_group() {} +#endif friend fixed_size_group get_fixed_size_group(ParentGroup g); + + friend sub_group_mask + sycl::detail::GetMask>( + fixed_size_group Group); }; template @@ -125,7 +138,20 @@ inline std::enable_if_t> && get_fixed_size_group(Group group) { (void)group; #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + uint32_t loc_id = group.get_local_linear_id(); + uint32_t loc_size = group.get_local_linear_range(); + uint32_t bits = PartitionSize == 32 + ? 0xffffffff + : ((1 << PartitionSize) - 1) + << ((loc_id / PartitionSize) * PartitionSize); + + return fixed_size_group( + sycl::detail::Builder::createSubGroupMask( + bits, loc_size)); +#else return fixed_size_group(); +#endif #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 0c31f9ad2290f..5e7befde49a6e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -39,10 +39,15 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { } #endif +template +inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { + return Group.Mask; +} + template inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { // TODO: This will need to be optimized - sycl::vec MemberMask = ExtractMask(Group.Mask); + sycl::vec MemberMask = ExtractMask(GetMask(Group)); uint32_t Count = 0; for (int i = 0; i < 4; ++i) { for (int b = 0; b < 32; ++b) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index a1c08a35b399b..b51c8f02ab8dc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -118,9 +118,8 @@ class opportunistic_group { friend opportunistic_group this_kernel::get_opportunistic_group(); - friend uint32_t - sycl::detail::IdToMaskPosition(opportunistic_group Group, - uint32_t Id); + friend sub_group_mask + sycl::detail::GetMask(opportunistic_group Group); }; namespace this_kernel { @@ -133,7 +132,12 @@ inline opportunistic_group get_opportunistic_group() { sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true); return opportunistic_group(mask); #elif defined(__NVPTX__) - // TODO: Construct from __activemask + uint32_t active_mask; + asm volatile("activemask.b32 %0;" : "=r"(active_mask)); + sub_group_mask mask = + sycl::detail::Builder::createSubGroupMask( + active_mask, 32); + return opportunistic_group(mask); #endif #else throw runtime_error("Non-uniform groups are not supported on host device.", diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp index 98fd7174208e5..86abbf0906224 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cpu || cuda || hip +// UNSUPPORTED: cpu || hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp index b4f7e96531076..b3d2e21f7eb17 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cpu || cuda || hip +// UNSUPPORTED: cpu || hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp index 925340cee1c6d..67b68ec31289b 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // -// UNSUPPORTED: cpu || cuda || hip +// UNSUPPORTED: cpu || hip #include #include From 4ca058c1d206094282eb6e38bf854717d7ebd5d4 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 5 May 2023 12:25:54 -0700 Subject: [PATCH 02/22] Fix test failure, add comment in libclc. Signed-off-by: JackAKirk --- libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl | 2 ++ sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp | 4 ++-- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl index 50826d9bf53e2..7e5f81850c352 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl @@ -39,5 +39,7 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) { _CLC_DEF _CLC_CONVERGENT uint _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j( uint scope, uint flag, __clc_vec4_uint32_t mask) { + // here we assume scope == __spv::Scope::Subgroup && flag == + // (int)__spv::GroupOperation::ExclusiveScan which is the only current use case return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]); } diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp index cc19edc820f70..e3e8a077edf2d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp @@ -119,8 +119,8 @@ template class tangle_group { friend tangle_group get_tangle_group(ParentGroup); - friend uint32_t sycl::detail::IdToMaskPosition>( - tangle_group Group, uint32_t Id); + friend sub_group_mask sycl::detail::GetMask>( + tangle_group Group); }; template diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index af86bdafae9d6..90e7d85e9ef69 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -94,11 +94,15 @@ struct sub_group_mask { bool none() const { return count() == 0; } uint32_t count() const { unsigned int count = 0; + #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits)); + #else auto word = (Bits & valuable_bits(bits_num)); while (word) { word &= (word - 1); count++; } + #endif return count; } uint32_t size() const { return bits_num; } From 369f25f48c9672973cf5016cd0e14a1bf640b4ff Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 9 May 2023 14:27:48 -0700 Subject: [PATCH 03/22] format Signed-off-by: JackAKirk --- .../sycl/ext/oneapi/experimental/fixed_size_group.hpp | 4 ++-- .../sycl/ext/oneapi/experimental/opportunistic_group.hpp | 4 ++-- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index ea8c9e010ae04..bd4de46945766 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -113,12 +113,12 @@ template class fixed_size_group { protected: -#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) sub_group_mask Mask; #endif #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - fixed_size_group(ext::oneapi::sub_group_mask mask):Mask(mask) {} + fixed_size_group(ext::oneapi::sub_group_mask mask) : Mask(mask) {} #else fixed_size_group() {} #endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index b51c8f02ab8dc..016aed88b555c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -135,8 +135,8 @@ inline opportunistic_group get_opportunistic_group() { uint32_t active_mask; asm volatile("activemask.b32 %0;" : "=r"(active_mask)); sub_group_mask mask = - sycl::detail::Builder::createSubGroupMask( - active_mask, 32); + sycl::detail::Builder::createSubGroupMask( + active_mask, 32); return opportunistic_group(mask); #endif #else diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 90e7d85e9ef69..f1bfe5aec20c2 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -94,15 +94,15 @@ struct sub_group_mask { bool none() const { return count() == 0; } uint32_t count() const { unsigned int count = 0; - #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits)); - #else +#else auto word = (Bits & valuable_bits(bits_num)); while (word) { word &= (word - 1); count++; } - #endif +#endif return count; } uint32_t size() const { return bits_num; } From f443f8170903517d1d4f954356038c0fb7670f09 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 9 May 2023 14:33:23 -0700 Subject: [PATCH 04/22] format Signed-off-by: JackAKirk --- sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index bd4de46945766..de45d5ba3bdef 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -112,7 +112,6 @@ template class fixed_size_group { } protected: - #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) sub_group_mask Mask; #endif From 3069a1e8fee7de23554c69537a18e05afd067163 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 03:33:25 -0700 Subject: [PATCH 05/22] Optimized `IdToMaskPosition` NVPTX case. Signed-off-by: JackAKirk --- .../sycl/ext/oneapi/experimental/non_uniform_groups.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 5e7befde49a6e..b099787f0fa25 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -46,8 +46,11 @@ inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { template inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { - // TODO: This will need to be optimized sycl::vec MemberMask = ExtractMask(GetMask(Group)); +#if defined(__NVPTX__) + return __nvvm_fns(MemberMask[0], 0, Id + 1); +#else + // TODO: This will need to be optimized uint32_t Count = 0; for (int i = 0; i < 4; ++i) { for (int b = 0; b < 32; ++b) { @@ -60,6 +63,7 @@ inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { } } return Count; +#endif } } // namespace detail From 0b1b82a4fb5052c8a034bced96b19ce38ac951a9 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 07:27:08 -0700 Subject: [PATCH 06/22] barrier, broadcast, any_of, all_of, none_of impls Signed-off-by: JackAKirk --- sycl/include/sycl/detail/spirv.hpp | 58 +++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 85cb4834552d4..0a65ba9e781ec 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -129,6 +130,9 @@ template bool GroupAll(Group, bool pred) { template bool GroupAll(ext::oneapi::experimental::ballot_group g, bool pred) { +#if defined(__NVPTX__) + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); +#else // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active @@ -137,26 +141,35 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } else { return __spirv_GroupNonUniformAll(group_scope::value, pred); } +#endif } template bool GroupAll( - ext::oneapi::experimental::fixed_size_group, + ext::oneapi::experimental::fixed_size_group g, bool pred) { +#if defined(__NVPTX__) + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); +#else // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), static_cast(pred), PartitionSize); +#endif } template bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } -template -bool GroupAll(const ext::oneapi::experimental::opportunistic_group &, + +bool GroupAll(const ext::oneapi::experimental::opportunistic_group &g, bool pred) { +#if defined(__NVPTX__) + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); +#else return __spirv_GroupNonUniformAll( group_scope::value, pred); +#endif } template bool GroupAny(Group, bool pred) { @@ -165,6 +178,9 @@ template bool GroupAny(Group, bool pred) { template bool GroupAny(ext::oneapi::experimental::ballot_group g, bool pred) { +#if defined (__NVPTX__) + return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); +#else // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active @@ -173,25 +189,34 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, } else { return __spirv_GroupNonUniformAny(group_scope::value, pred); } +#endif } template bool GroupAny( - ext::oneapi::experimental::fixed_size_group, + ext::oneapi::experimental::fixed_size_group g, bool pred) { +#if defined (__NVPTX__) + return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); +#else // GroupNonUniformAny doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseOr( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), static_cast(pred), PartitionSize); +#endif } template bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { return __spirv_GroupNonUniformAny(group_scope::value, pred); } -bool GroupAny(const ext::oneapi::experimental::opportunistic_group &, +bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g, bool pred) { +#if defined (__NVPTX__) + return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); +#else return __spirv_GroupNonUniformAny( group_scope::value, pred); +#endif } // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic @@ -269,7 +294,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); - +#if defined(__NVPTX__) + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31); +#else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -289,6 +316,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, OCLId); } +#endif } template EnableIfNativeBroadcast GroupBroadcast( @@ -296,7 +324,9 @@ EnableIfNativeBroadcast GroupBroadcast( T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; - +#if defined(__NVPTX__) + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31); +#else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -312,6 +342,7 @@ EnableIfNativeBroadcast GroupBroadcast( // partition, and it's unclear which will be faster in practice. return __spirv_GroupNonUniformShuffle(group_scope::value, OCLX, OCLId); +#endif } template EnableIfNativeBroadcast @@ -338,7 +369,9 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, IdT local_id) { // Remap local_id to its original numbering in sub-group auto LocalId = detail::IdToMaskPosition(g, local_id); - +#if defined(__NVPTX__) + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31); +#else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -351,6 +384,7 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, return __spirv_GroupNonUniformBroadcast( group_scope::value, OCLX, OCLId); +#endif } template @@ -1022,8 +1056,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { template typename std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v> -ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { -#if defined(__SPIR__) +ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { +#if defined(__NVPTX__) + __nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]); +#else // SPIR-V does not define an instruction to synchronize partial groups. // However, most (possibly all?) of the current SPIR-V targets execute // work-items in lockstep, so we can probably get away with a MemoryBarrier. @@ -1033,8 +1069,6 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { __spv::MemorySemanticsMask::SubgroupMemory | __spv::MemorySemanticsMask::WorkgroupMemory | __spv::MemorySemanticsMask::CrossWorkgroupMemory); -#elif defined(__NVPTX__) - // TODO: Call syncwarp with appropriate mask extracted from the group #endif } From 5a9668b3ed3f572208392fe75f41b519e23a35fd Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 07:41:53 -0700 Subject: [PATCH 07/22] reduce/scan impls. Signed-off-by: JackAKirk --- .../cuda/non_uniform_algorithms.hpp | 383 ++++++++++++++++++ 1 file changed, 383 insertions(+) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp new file mode 100644 index 0000000000000..6a072b1f21ece --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -0,0 +1,383 @@ +//==----- non_uniform_algorithms.hpp - cuda masked subgroup algorithms -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + namespace detail { + +template +using IsRedux = + std::bool_constant::value && + sycl::detail::IsBitAND::value || + sycl::detail::IsBitOR::value || + sycl::detail::IsBitXOR::value || + sycl::detail::IsPlus::value || + sycl::detail::IsMinimum::value || + sycl::detail::IsMaximum::value>; + +#ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + +//// Masked reductions using redux.sync, requires integer types + +template +std::enable_if_t::value && + sycl::detail::IsMinimum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_umin(x, MemberMask); +} + +template +std::enable_if_t::value && + sycl::detail::IsMinimum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_min(x, MemberMask); +} + +template +std::enable_if_t::value && + sycl::detail::IsMaximum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_umax(x, MemberMask); +} + +template +std::enable_if_t::value && + sycl::detail::IsMaximum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_max(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsPlus::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_add(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsBitAND::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_and(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsBitOR::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_or(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsBitXOR::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_xor(x, MemberMask); +} +//// + +//// Shuffle based masked reduction impls + +// Cluster group reduction using shfls, T = double +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_fixed_size_group_v && + std::is_same_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); + auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); + double tmp; + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(tmp) + : "r"(tmp_a), "r"(tmp_b)); + x = binary_op(x, tmp); + } + + return x; +} + +// Cluster group reduction using shfls, T = float +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_fixed_size_group_v && + std::is_same_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + auto tmp = + __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i); + x = binary_op(x, __nvvm_bitcast_i2f(tmp)); + } + return x; +} + +// Cluster group reduction using shfls, std::is_integral_v +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_fixed_size_group_v && + std::is_integral_v, + T> +masked_reduction_cuda_shfls( + Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + x = binary_op(x, tmp); + } + return x; +} + +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v, T> +non_uniform_shfl_T(const uint32_t MemberMask, T x, int delta) { + if constexpr (ext::oneapi::experimental::is_fixed_size_group_v) { + return __nvvm_shfl_sync_up_i32(MemberMask, x, delta, 0); + } else { + return __nvvm_shfl_sync_idx_i32(MemberMask, x, delta, 31); + } +} + +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v, T> +non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int delta) { + T res; + if constexpr (std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, delta); + auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, delta); + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(res) + : "r"(tmp_a), "r"(tmp_b)); + } else { + auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; + auto tmp_b32 = non_uniform_shfl_T(MemberMask, input, delta); + res = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; + } + return res; +} + +// Opportunistic/Ballot group reduction using shfls +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v && + !ext::oneapi::experimental::is_fixed_size_group_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + if (MemberMask == 0xffffffff) { + for (int i = 16; i > 0; i /= 2) { + auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + x = binary_op(x, tmp); + } + return x; + } + + unsigned localSetBit = g.get_local_id()[0] + 1; + + // number of elements requiring binary operations each loop iteration + auto opRange = g.get_local_range()[0]; + + // remainder that won't have a binary partner each loop iteration + int remainder; + + while (opRange / 2 >= 1) { + remainder = opRange % 2; + + // stride between local_ids forming a binary op + int stride = opRange / 2; + + // unfolded position of set bit in mask of shfl src lane + int unfoldedSrcSetBit = localSetBit + stride; + + // __nvvm_fns automatically wraps around to the correct bit position. + // There is no performance impact on src_set_bit position wrt localSetBit + auto tmp = non_uniform_shfl( + g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); + + if (!(localSetBit == 1 && remainder != 0)) { + x = binary_op(x, tmp); + } + + opRange = std::ceil((float)opRange / 2.0f); + } + int broadID; + int maskRev; + asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask)); + asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev)); + + return non_uniform_shfl(g, MemberMask, x, broadID); +} + +// Non Redux types must fall back to shfl based implementations. +template +std::enable_if_t< + std::is_same, std::false_type>::value && + ext::oneapi::experimental::is_user_constructed_group_v, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask); +} + +// get_identity is only currently used in this cuda specific header. If in the future it has more general use is should be moved to a more appropriate header. +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value || + sycl::detail::IsBitOR::value || + sycl::detail::IsBitXOR::value, + T> + get_identity() { + return 0; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, T> + get_identity() { + return 1; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, T> + get_identity() { + return ~0; +} + +#define GET_ID(OP_CHECK, OP) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, T> \ + get_identity() { \ + if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } \ + return 0; \ + } + +GET_ID(IsMinimum, max) +GET_ID(IsMaximum, min) + +#undef GET_ID + +//// Shuffle based masked reduction impls + +// Cluster group scan using shfls +template <__spv::GroupOperation Op, typename Group, typename T, + class BinaryOperation> +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_fixed_size_group_v, T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + for (int i = 1; i < g.get_local_range()[0]; i *= 2) { + auto tmp = + non_uniform_shfl(g, MemberMask, x, i); + if (g.get_local_id()[0] >= i) + x = binary_op(x, tmp); + } + if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { + + x = non_uniform_shfl(g, MemberMask, x, 1); + if (g.get_local_id()[0] == 0) { + return get_identity(); + } + } + return x; +} + +template <__spv::GroupOperation Op, typename Group, typename T, + class BinaryOperation> +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v && + !ext::oneapi::experimental::is_fixed_size_group_v, + T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + int localIdVal = g.get_local_id()[0]; + int localSetBit = localIdVal + 1; + + for (int i = 1; i < g.get_local_range()[0]; i *= 2) { + int unfoldedSrcSetBit = localSetBit - i; + + auto tmp = non_uniform_shfl( + g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); + if (localIdVal >= i) + x = binary_op(x, tmp); + } + if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { + x = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, localSetBit - 1)); + if (localIdVal == 0) { + return get_identity(); + } + } + return x; +} + +#endif +#endif +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl From 4188a17e21a43619c15db177e4687bdd53fe632a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 07:44:12 -0700 Subject: [PATCH 08/22] is_fixed_size_group check for reduce/scan branch impls Signed-off-by: JackAKirk --- sycl/include/sycl/detail/type_traits.hpp | 6 ++++++ .../sycl/ext/oneapi/experimental/fixed_size_group.hpp | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index b6613ea080c03..ac1853e68ac6d 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -50,6 +50,12 @@ template inline constexpr bool is_user_constructed_group_v = is_user_constructed_group::value; +template struct is_fixed_size_group : std::false_type {}; + +template +inline constexpr bool is_fixed_size_group_v = + is_fixed_size_group::value; + namespace detail { template struct is_group_helper : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index de45d5ba3bdef..d51a18c3f2725 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -161,6 +161,10 @@ template struct is_user_constructed_group> : std::true_type {}; +template +struct is_fixed_size_group> + : std::true_type {}; + } // namespace ext::oneapi::experimental template From cf3d4e774a9303626cd26f27ebc4ae22c5b6dff2 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 07:47:33 -0700 Subject: [PATCH 09/22] cuda reduce/scans use non_uniform_algorithms.hpp Signed-off-by: JackAKirk --- sycl/include/sycl/group_algorithm.hpp | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 424a7800d7bb7..0d036aa4bc18e 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -204,8 +205,23 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { +#if (__SYCL_CUDA_ARCH__ >= 800) + return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]); +#else + return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]); +#endif + } else { + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + } +#else return sycl::detail::calc<__spv::GroupOperation::Reduce>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); +#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", @@ -633,8 +649,19 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(g, x, binary_op, MemberMask[0]); + } else { + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + } +#else return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); +#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", @@ -862,8 +889,19 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(g, x, binary_op, MemberMask[0]); + } else { + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + } +#else return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); +#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", From cf55f581e45d29c35531ab61b67a9132e24776c4 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 07:57:27 -0700 Subject: [PATCH 10/22] Enabled cuda in algorithm tests. Signed-off-by: JackAKirk --- sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp | 2 +- sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp | 2 +- .../NonUniformGroups/opportunistic_group_algorithms.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index da27e89bc2458..03cb9e5ba6a7f 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index a338a6cd6f98a..d2ed6e97baf65 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp index ef364bf243425..93636a8156167 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip #include #include From c045fc5fb445ce82f199877ac73bbe92023c7201 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 19 May 2023 08:08:41 -0700 Subject: [PATCH 11/22] Added missing volatile. Signed-off-by: JackAKirk --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 6a072b1f21ece..763bb426ac033 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -246,8 +246,8 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, } int broadID; int maskRev; - asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask)); - asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev)); + asm volatile("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask)); + asm volatile("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev)); return non_uniform_shfl(g, MemberMask, x, broadID); } From 36e59bbb1193c61d1a259f6afd7b7297ecd4fa10 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 31 May 2023 14:22:01 +0100 Subject: [PATCH 12/22] Format and fixed sycl branch merge. Signed-off-by: JackAKirk --- .../cuda/non_uniform_algorithms.hpp | 66 +++++++++---------- .../sycl/ext/oneapi/sub_group_mask.hpp | 4 -- sycl/include/sycl/group_algorithm.hpp | 34 ++++------ 3 files changed, 43 insertions(+), 61 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 763bb426ac033..fb0b248ffc2f3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -25,8 +25,7 @@ using IsRedux = sycl::detail::IsMinimum::value || sycl::detail::IsMaximum::value>; -#ifdef __SYCL_DEVICE_ONLY__ -#if defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) //// Masked reductions using redux.sync, requires integer types @@ -111,12 +110,12 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // Cluster group reduction using shfls, T = double template -inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_fixed_size_group_v && - std::is_same_v, - T> -masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE + std::enable_if_t && + std::is_same_v, + T> + masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { int x_a, x_b; asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); @@ -135,12 +134,12 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // Cluster group reduction using shfls, T = float template -inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_fixed_size_group_v && - std::is_same_v, - T> -masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE + std::enable_if_t && + std::is_same_v, + T> + masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { auto tmp = @@ -152,13 +151,12 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // Cluster group reduction using shfls, std::is_integral_v template -inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_fixed_size_group_v && - std::is_integral_v, - T> -masked_reduction_cuda_shfls( - Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE + std::enable_if_t && + std::is_integral_v, + T> + masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); @@ -235,8 +233,8 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // __nvvm_fns automatically wraps around to the correct bit position. // There is no performance impact on src_set_bit position wrt localSetBit - auto tmp = non_uniform_shfl( - g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); + auto tmp = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); if (!(localSetBit == 1 && remainder != 0)) { x = binary_op(x, tmp); @@ -263,7 +261,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask); } -// get_identity is only currently used in this cuda specific header. If in the future it has more general use is should be moved to a more appropriate header. +// get_identity is only currently used in this cuda specific header. If in the +// future it has more general use it should be moved to a more appropriate +// header. template inline __SYCL_ALWAYS_INLINE std::enable_if_t::value || @@ -327,13 +327,12 @@ GET_ID(IsMaximum, min) // Cluster group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_fixed_size_group_v, T> -masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE + std::enable_if_t, T> + masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = 1; i < g.get_local_range()[0]; i *= 2) { - auto tmp = - non_uniform_shfl(g, MemberMask, x, i); + auto tmp = non_uniform_shfl(g, MemberMask, x, i); if (g.get_local_id()[0] >= i) x = binary_op(x, tmp); } @@ -361,8 +360,8 @@ masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, for (int i = 1; i < g.get_local_range()[0]; i *= 2) { int unfoldedSrcSetBit = localSetBit - i; - auto tmp = non_uniform_shfl( - g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); + auto tmp = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); if (localIdVal >= i) x = binary_op(x, tmp); } @@ -376,8 +375,7 @@ masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, return x; } -#endif -#endif +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index cad2c8daf4242..d6365e2ca88f5 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -106,15 +106,11 @@ struct sub_group_mask { __spv::Scope::Subgroup, (int)__spv::GroupOperation::Reduce, OCLMask); #else unsigned int count = 0; -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits)); -#else auto word = (Bits & valuable_bits(bits_num)); while (word) { word &= (word - 1); count++; } -#endif return count; #endif } diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 8a3284b3aa029..cdcb8145b190c 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -209,22 +209,18 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) { "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ #if defined(__NVPTX__) - sycl::vec MemberMask = - sycl::detail::ExtractMask(sycl::detail::GetMask(g)); if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); #if (__SYCL_CUDA_ARCH__ >= 800) return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]); #else return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]); #endif - } else { - return sycl::detail::calc<__spv::GroupOperation::Reduce>( - g, typename sycl::detail::GroupOpTag::type(), x, binary_op); } -#else +#endif return sycl::detail::calc<__spv::GroupOperation::Reduce>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); -#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", @@ -653,18 +649,14 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ #if defined(__NVPTX__) - sycl::vec MemberMask = - sycl::detail::ExtractMask(sycl::detail::GetMask(g)); if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { - return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(g, x, binary_op, MemberMask[0]); - } else { - return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( - g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>( + g, x, binary_op, + sycl::detail::ExtractMask(sycl::detail::GetMask(g))[0]); } -#else +#endif return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); -#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", @@ -893,18 +885,14 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ #if defined(__NVPTX__) - sycl::vec MemberMask = - sycl::detail::ExtractMask(sycl::detail::GetMask(g)); if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { - return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(g, x, binary_op, MemberMask[0]); - } else { - return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( - g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>( + g, x, binary_op, + sycl::detail::ExtractMask(sycl::detail::GetMask(g))[0]); } -#else +#endif return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); -#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", From f3c8665ab62d7c7db804ac7b757a5e3526ea13f5 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 31 May 2023 14:49:38 +0100 Subject: [PATCH 13/22] Format. Signed-off-by: JackAKirk --- sycl/include/sycl/detail/spirv.hpp | 15 +++++++++------ .../experimental/cuda/non_uniform_algorithms.hpp | 4 ++-- sycl/include/sycl/group_algorithm.hpp | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 0a65ba9e781ec..e6804ab589426 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -178,7 +178,7 @@ template bool GroupAny(Group, bool pred) { template bool GroupAny(ext::oneapi::experimental::ballot_group g, bool pred) { -#if defined (__NVPTX__) +#if defined(__NVPTX__) return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); #else // ballot_group partitions its parent into two groups (0 and 1) @@ -195,7 +195,7 @@ template bool GroupAny( ext::oneapi::experimental::fixed_size_group g, bool pred) { -#if defined (__NVPTX__) +#if defined(__NVPTX__) return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); #else // GroupNonUniformAny doesn't support cluster size, so use a reduction @@ -211,7 +211,7 @@ bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { } bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g, bool pred) { -#if defined (__NVPTX__) +#if defined(__NVPTX__) return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); #else return __spirv_GroupNonUniformAny( @@ -295,7 +295,8 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); #if defined(__NVPTX__) - return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31); + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, + LocalId, 31); #else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; @@ -325,7 +326,8 @@ EnableIfNativeBroadcast GroupBroadcast( // Remap local_id to its original numbering in ParentGroup auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; #if defined(__NVPTX__) - return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31); + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, + LocalId, 31); #else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; @@ -370,7 +372,8 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, // Remap local_id to its original numbering in sub-group auto LocalId = detail::IdToMaskPosition(g, local_id); #if defined(__NVPTX__) - return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31); + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, + LocalId, 31); #else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index fb0b248ffc2f3..45799002f4372 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include #include -#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { - namespace detail { +namespace detail { template using IsRedux = diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index cdcb8145b190c..00c9d16263d1b 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { From 57c0bd9ecaac5f72ccd5b6c2b97934df4e887a3a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 31 May 2023 14:53:38 +0100 Subject: [PATCH 14/22] Format. Signed-off-by: JackAKirk --- sycl/include/sycl/detail/type_traits.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index ac1853e68ac6d..df17a8e945d5d 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -53,8 +53,7 @@ inline constexpr bool is_user_constructed_group_v = template struct is_fixed_size_group : std::false_type {}; template -inline constexpr bool is_fixed_size_group_v = - is_fixed_size_group::value; +inline constexpr bool is_fixed_size_group_v = is_fixed_size_group::value; namespace detail { template struct is_group_helper : std::false_type {}; From 48785863f98f6ef230db2ea56df3300e5fcf3a2f Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 31 May 2023 15:01:10 +0100 Subject: [PATCH 15/22] Make Is_Redux nvptx only. Signed-off-by: JackAKirk --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 45799002f4372..932b4d88ce894 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -14,6 +14,7 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) template using IsRedux = @@ -25,8 +26,6 @@ using IsRedux = sycl::detail::IsMinimum::value || sycl::detail::IsMaximum::value>; -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - //// Masked reductions using redux.sync, requires integer types template From 7aa585f734addb64a96d2bdb4ee83a8a5b64391d Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 31 May 2023 15:16:42 +0100 Subject: [PATCH 16/22] Added known_identity.hpp include. Signed-off-by: JackAKirk --- .../cuda/non_uniform_algorithms.hpp | 66 ++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 932b4d88ce894..64bf5b13c4eec 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -17,57 +18,49 @@ namespace detail { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) template -using IsRedux = - std::bool_constant::value && - sycl::detail::IsBitAND::value || - sycl::detail::IsBitOR::value || - sycl::detail::IsBitXOR::value || - sycl::detail::IsPlus::value || - sycl::detail::IsMinimum::value || - sycl::detail::IsMaximum::value>; +using IsRedux = std::bool_constant< + std::is_integral::value && IsBitAND::value || + IsBitOR::value || IsBitXOR::value || + IsPlus::value || IsMinimum::value || + IsMaximum::value>; //// Masked reductions using redux.sync, requires integer types template -std::enable_if_t::value && - sycl::detail::IsMinimum::value, - T> +std::enable_if_t< + is_sugeninteger::value && IsMinimum::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { return __nvvm_redux_sync_umin(x, MemberMask); } template -std::enable_if_t::value && - sycl::detail::IsMinimum::value, - T> +std::enable_if_t< + is_sigeninteger::value && IsMinimum::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { return __nvvm_redux_sync_min(x, MemberMask); } template -std::enable_if_t::value && - sycl::detail::IsMaximum::value, - T> +std::enable_if_t< + is_sugeninteger::value && IsMaximum::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { return __nvvm_redux_sync_umax(x, MemberMask); } template -std::enable_if_t::value && - sycl::detail::IsMaximum::value, - T> +std::enable_if_t< + is_sigeninteger::value && IsMaximum::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { return __nvvm_redux_sync_max(x, MemberMask); } template -std::enable_if_t<(sycl::detail::is_sugeninteger::value || - sycl::detail::is_sigeninteger::value) && - sycl::detail::IsPlus::value, +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsPlus::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -75,9 +68,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(sycl::detail::is_sugeninteger::value || - sycl::detail::is_sigeninteger::value) && - sycl::detail::IsBitAND::value, +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsBitAND::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -85,9 +77,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(sycl::detail::is_sugeninteger::value || - sycl::detail::is_sigeninteger::value) && - sycl::detail::IsBitOR::value, +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsBitOR::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -95,9 +86,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(sycl::detail::is_sugeninteger::value || - sycl::detail::is_sigeninteger::value) && - sycl::detail::IsBitXOR::value, +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsBitXOR::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -265,9 +255,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // header. template inline __SYCL_ALWAYS_INLINE - std::enable_if_t::value || - sycl::detail::IsBitOR::value || - sycl::detail::IsBitXOR::value, + std::enable_if_t::value || + IsBitOR::value || + IsBitXOR::value, T> get_identity() { return 0; @@ -275,14 +265,14 @@ inline __SYCL_ALWAYS_INLINE template inline __SYCL_ALWAYS_INLINE - std::enable_if_t::value, T> + std::enable_if_t::value, T> get_identity() { return 1; } template inline __SYCL_ALWAYS_INLINE - std::enable_if_t::value, T> + std::enable_if_t::value, T> get_identity() { return ~0; } @@ -290,7 +280,7 @@ inline __SYCL_ALWAYS_INLINE #define GET_ID(OP_CHECK, OP) \ template \ inline __SYCL_ALWAYS_INLINE \ - std::enable_if_t::value, T> \ + std::enable_if_t::value, T> \ get_identity() { \ if constexpr (std::is_same_v) { \ return std::numeric_limits::OP(); \ From a40e410914c763b99a9ba0fcd632b04f092704ce Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 1 Jun 2023 03:32:42 -0700 Subject: [PATCH 17/22] Addressed review comments. Signed-off-by: JackAKirk --- .../cuda/non_uniform_algorithms.hpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 64bf5b13c4eec..0911d5d1faab9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -97,7 +97,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, //// Shuffle based masked reduction impls -// Cluster group reduction using shfls, T = double +// fixed_size_group group reduction using shfls, T = double template inline __SYCL_ALWAYS_INLINE std::enable_if_t && @@ -121,7 +121,7 @@ inline __SYCL_ALWAYS_INLINE return x; } -// Cluster group reduction using shfls, T = float +// fixed_size_group group reduction using shfls, T = float template inline __SYCL_ALWAYS_INLINE std::enable_if_t && @@ -138,7 +138,7 @@ inline __SYCL_ALWAYS_INLINE return x; } -// Cluster group reduction using shfls, std::is_integral_v +// fixed_size_group group reduction using shfls, std::is_integral_v template inline __SYCL_ALWAYS_INLINE std::enable_if_t && @@ -157,31 +157,31 @@ inline __SYCL_ALWAYS_INLINE template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v, T> -non_uniform_shfl_T(const uint32_t MemberMask, T x, int delta) { +non_uniform_shfl_T(const uint32_t MemberMask, T x, int shfl_param) { if constexpr (ext::oneapi::experimental::is_fixed_size_group_v) { - return __nvvm_shfl_sync_up_i32(MemberMask, x, delta, 0); + return __nvvm_shfl_sync_up_i32(MemberMask, x, shfl_param, 0); } else { - return __nvvm_shfl_sync_idx_i32(MemberMask, x, delta, 31); + return __nvvm_shfl_sync_idx_i32(MemberMask, x, shfl_param, 31); } } template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v, T> -non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int delta) { +non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) { T res; if constexpr (std::is_same_v) { int x_a, x_b; asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); - auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, delta); - auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, delta); + auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, shfl_param); + auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, shfl_param); asm volatile("mov.b64 %0,{%1,%2}; \n\t" : "=l"(res) : "r"(tmp_a), "r"(tmp_b)); } else { auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; - auto tmp_b32 = non_uniform_shfl_T(MemberMask, input, delta); + auto tmp_b32 = non_uniform_shfl_T(MemberMask, input, shfl_param); res = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; } return res; @@ -313,7 +313,7 @@ GET_ID(IsMaximum, min) //// Shuffle based masked reduction impls -// Cluster group scan using shfls +// fixed_size_group group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE From f20c936ac9f72651bc1cd603be3d7aa021d80177 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 1 Jun 2023 17:52:33 +0100 Subject: [PATCH 18/22] Removed unrequired includes. Signed-off-by: JackAKirk --- .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 0911d5d1faab9..c03eef3d1605a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -7,9 +7,6 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include -#include #include namespace sycl { From dcffcbeb664beac32999e5f421e93107a490d14e Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 1 Jun 2023 21:52:11 +0100 Subject: [PATCH 19/22] Removed breaking opportunistic_group include. forward decl of opportunistic_group is failing in spirv.hpp for nvptx so I removed nvptx specific usages from spirv.hpp Signed-off-by: JackAKirk --- sycl/include/sycl/detail/spirv.hpp | 41 --------------------------- sycl/include/sycl/group_algorithm.hpp | 25 ++++++++++++++++ 2 files changed, 25 insertions(+), 41 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index e6804ab589426..278c019269a7d 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -130,9 +129,6 @@ template bool GroupAll(Group, bool pred) { template bool GroupAll(ext::oneapi::experimental::ballot_group g, bool pred) { -#if defined(__NVPTX__) - return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); -#else // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active @@ -141,21 +137,16 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } else { return __spirv_GroupNonUniformAll(group_scope::value, pred); } -#endif } template bool GroupAll( ext::oneapi::experimental::fixed_size_group g, bool pred) { -#if defined(__NVPTX__) - return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); -#else // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), static_cast(pred), PartitionSize); -#endif } template bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { @@ -164,12 +155,8 @@ bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { bool GroupAll(const ext::oneapi::experimental::opportunistic_group &g, bool pred) { -#if defined(__NVPTX__) - return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); -#else return __spirv_GroupNonUniformAll( group_scope::value, pred); -#endif } template bool GroupAny(Group, bool pred) { @@ -178,9 +165,6 @@ template bool GroupAny(Group, bool pred) { template bool GroupAny(ext::oneapi::experimental::ballot_group g, bool pred) { -#if defined(__NVPTX__) - return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); -#else // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active @@ -189,21 +173,16 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, } else { return __spirv_GroupNonUniformAny(group_scope::value, pred); } -#endif } template bool GroupAny( ext::oneapi::experimental::fixed_size_group g, bool pred) { -#if defined(__NVPTX__) - return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); -#else // GroupNonUniformAny doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseOr( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), static_cast(pred), PartitionSize); -#endif } template bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { @@ -211,12 +190,8 @@ bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { } bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g, bool pred) { -#if defined(__NVPTX__) - return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred); -#else return __spirv_GroupNonUniformAny( group_scope::value, pred); -#endif } // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic @@ -294,10 +269,6 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); -#if defined(__NVPTX__) - return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, - LocalId, 31); -#else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -317,7 +288,6 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, OCLId); } -#endif } template EnableIfNativeBroadcast GroupBroadcast( @@ -325,10 +295,6 @@ EnableIfNativeBroadcast GroupBroadcast( T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; -#if defined(__NVPTX__) - return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, - LocalId, 31); -#else // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -344,7 +310,6 @@ EnableIfNativeBroadcast GroupBroadcast( // partition, and it's unclear which will be faster in practice. return __spirv_GroupNonUniformShuffle(group_scope::value, OCLX, OCLId); -#endif } template EnableIfNativeBroadcast @@ -371,11 +336,6 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, IdT local_id) { // Remap local_id to its original numbering in sub-group auto LocalId = detail::IdToMaskPosition(g, local_id); -#if defined(__NVPTX__) - return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, - LocalId, 31); -#else - // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); using OCLT = detail::ConvertToOpenCLType_t; @@ -387,7 +347,6 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, return __spirv_GroupNonUniformBroadcast( group_scope::value, OCLX, OCLId); -#endif } template diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 00c9d16263d1b..1233b8bb4833a 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -387,6 +387,12 @@ template std::enable_if_t>, bool> any_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], + pred); + } +#endif return sycl::detail::spirv::GroupAny(g, pred); #else (void)g; @@ -427,6 +433,12 @@ template std::enable_if_t>, bool> all_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], + pred); + } +#endif return sycl::detail::spirv::GroupAll(g, pred); #else (void)g; @@ -467,6 +479,12 @@ template std::enable_if_t>, bool> none_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], + !pred); + } +#endif return sycl::detail::spirv::GroupAll(g, !pred); #else (void)g; @@ -585,6 +603,13 @@ std::enable_if_t<(is_group_v> && T> group_broadcast(Group g, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + auto LocalId = detail::IdToMaskPosition(g, local_id); + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], + x, LocalId, 31); + } +#endif return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)g; From 9844fcba5b456fd024c42a6218029c1e0bbc6eba Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 2 Jun 2023 09:56:38 +0100 Subject: [PATCH 20/22] Revert unrequired changes. Signed-off-by: JackAKirk --- sycl/include/sycl/detail/spirv.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 278c019269a7d..48631bc800033 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -140,7 +140,7 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } template bool GroupAll( - ext::oneapi::experimental::fixed_size_group g, + ext::oneapi::experimental::fixed_size_group, bool pred) { // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( @@ -153,7 +153,7 @@ bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } -bool GroupAll(const ext::oneapi::experimental::opportunistic_group &g, +bool GroupAll(const ext::oneapi::experimental::opportunistic_group &, bool pred) { return __spirv_GroupNonUniformAll( group_scope::value, pred); @@ -176,7 +176,7 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, } template bool GroupAny( - ext::oneapi::experimental::fixed_size_group g, + ext::oneapi::experimental::fixed_size_group, bool pred) { // GroupNonUniformAny doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseOr( @@ -188,7 +188,7 @@ template bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { return __spirv_GroupNonUniformAny(group_scope::value, pred); } -bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g, +bool GroupAny(const ext::oneapi::experimental::opportunistic_group &, bool pred) { return __spirv_GroupNonUniformAny( group_scope::value, pred); @@ -269,6 +269,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); + // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -295,6 +296,7 @@ EnableIfNativeBroadcast GroupBroadcast( T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; + // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); @@ -336,6 +338,8 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, IdT local_id) { // Remap local_id to its original numbering in sub-group auto LocalId = detail::IdToMaskPosition(g, local_id); + + // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); using OCLT = detail::ConvertToOpenCLType_t; From eea1d9ab0d66354034a3872936032ada47ee57c8 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 9 Jun 2023 21:25:17 +0100 Subject: [PATCH 21/22] Added missing types. Refactored impl. Signed-off-by: JackAKirk --- .../cuda/non_uniform_algorithms.hpp | 172 ++++++++---------- 1 file changed, 72 insertions(+), 100 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index c03eef3d1605a..c149dcae16618 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -94,58 +94,42 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, //// Shuffle based masked reduction impls -// fixed_size_group group reduction using shfls, T = double +// fixed_size_group group reduction using shfls template inline __SYCL_ALWAYS_INLINE - std::enable_if_t && - std::is_same_v, - T> - masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { - for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { - int x_a, x_b; - asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); - - auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); - auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); - double tmp; - asm volatile("mov.b64 %0,{%1,%2}; \n\t" - : "=l"(tmp) - : "r"(tmp_a), "r"(tmp_b)); - x = binary_op(x, tmp); - } - - return x; -} - -// fixed_size_group group reduction using shfls, T = float -template -inline __SYCL_ALWAYS_INLINE - std::enable_if_t && - std::is_same_v, - T> - masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { - - for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { - auto tmp = - __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i); - x = binary_op(x, __nvvm_bitcast_i2f(tmp)); - } - return x; -} - -// fixed_size_group group reduction using shfls, std::is_integral_v -template -inline __SYCL_ALWAYS_INLINE - std::enable_if_t && - std::is_integral_v, - T> + std::enable_if_t, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { - for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { - auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + T tmp; + if constexpr (std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x)); + auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); + auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); + asm volatile("mov.b64 %0,{%1,%2};" : "=d"(tmp) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v || + std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x)); + auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); + auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); + asm volatile("mov.b64 %0,{%1,%2};" : "=l"(tmp) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v) { + short tmp_b16; + asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x)); + auto tmp_b32 = __nvvm_shfl_sync_bfly_i32( + MemberMask, static_cast(tmp_b16), -1, i); + asm volatile("mov.b16 %0,%1;" + : "=h"(tmp) + : "h"(static_cast(tmp_b32))); + } else if constexpr (std::is_same_v) { + auto tmp_b32 = + __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i); + tmp = __nvvm_bitcast_i2f(tmp_b32); + } else { + tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + } x = binary_op(x, tmp); } return x; @@ -169,17 +153,31 @@ non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) { T res; if constexpr (std::is_same_v) { int x_a, x_b; - asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); - + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x)); auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, shfl_param); auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, shfl_param); - asm volatile("mov.b64 %0,{%1,%2}; \n\t" - : "=l"(res) - : "r"(tmp_a), "r"(tmp_b)); + asm volatile("mov.b64 %0,{%1,%2};" : "=d"(res) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v || + std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x)); + auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, shfl_param); + auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, shfl_param); + asm volatile("mov.b64 %0,{%1,%2};" : "=l"(res) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v) { + short tmp_b16; + asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x)); + auto tmp_b32 = non_uniform_shfl_T( + MemberMask, static_cast(tmp_b16), shfl_param); + asm volatile("mov.b16 %0,%1;" + : "=h"(res) + : "h"(static_cast(tmp_b32))); + } else if constexpr (std::is_same_v) { + auto tmp_b32 = non_uniform_shfl_T(MemberMask, __nvvm_bitcast_f2i(x), + shfl_param); + res = __nvvm_bitcast_i2f(tmp_b32); } else { - auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; - auto tmp_b32 = non_uniform_shfl_T(MemberMask, input, shfl_param); - res = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; + res = non_uniform_shfl_T(MemberMask, x, shfl_param); } return res; } @@ -192,27 +190,18 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { - if (MemberMask == 0xffffffff) { - for (int i = 16; i > 0; i /= 2) { - auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); - x = binary_op(x, tmp); - } - return x; - } unsigned localSetBit = g.get_local_id()[0] + 1; // number of elements requiring binary operations each loop iteration auto opRange = g.get_local_range()[0]; - // remainder that won't have a binary partner each loop iteration - int remainder; - - while (opRange / 2 >= 1) { - remainder = opRange % 2; + // stride between local_ids forming a binary op + unsigned stride = opRange / 2; + while (stride >= 1) { - // stride between local_ids forming a binary op - int stride = opRange / 2; + // if (remainder == 1), there is a WI without a binary op partner + unsigned remainder = opRange % 2; // unfolded position of set bit in mask of shfl src lane int unfoldedSrcSetBit = localSetBit + stride; @@ -226,12 +215,15 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, x = binary_op(x, tmp); } - opRange = std::ceil((float)opRange / 2.0f); + opRange = stride + remainder; + stride = opRange / 2; } - int broadID; - int maskRev; - asm volatile("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask)); - asm volatile("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev)); + unsigned broadID; + asm volatile(".reg .u32 rev;\n\t" + "brev.b32 rev, %1;\n\t" // reverse mask bits + "clz.b32 %0, rev;" + : "=r"(broadID) + : "r"(MemberMask)); return non_uniform_shfl(g, MemberMask, x, broadID); } @@ -279,28 +271,7 @@ inline __SYCL_ALWAYS_INLINE inline __SYCL_ALWAYS_INLINE \ std::enable_if_t::value, T> \ get_identity() { \ - if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } else if constexpr (std::is_same_v) { \ - return std::numeric_limits::OP(); \ - } \ - return 0; \ + return std::numeric_limits::OP(); \ } GET_ID(IsMinimum, max) @@ -317,15 +288,16 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { + unsigned localIdVal = g.get_local_id()[0]; for (int i = 1; i < g.get_local_range()[0]; i *= 2) { auto tmp = non_uniform_shfl(g, MemberMask, x, i); - if (g.get_local_id()[0] >= i) + if (localIdVal >= i) x = binary_op(x, tmp); } if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { x = non_uniform_shfl(g, MemberMask, x, 1); - if (g.get_local_id()[0] == 0) { + if (localIdVal == 0) { return get_identity(); } } @@ -340,8 +312,8 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { - int localIdVal = g.get_local_id()[0]; - int localSetBit = localIdVal + 1; + unsigned localIdVal = g.get_local_id()[0]; + unsigned localSetBit = localIdVal + 1; for (int i = 1; i < g.get_local_range()[0]; i *= 2) { int unfoldedSrcSetBit = localSetBit - i; From 686d11727028759b4b857b3968b6afbc408325b9 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 9 Jun 2023 21:41:49 +0100 Subject: [PATCH 22/22] is_fixed_size_group moved to detail namespace. Signed-off-by: JackAKirk --- sycl/include/sycl/detail/type_traits.hpp | 12 ++++++----- .../cuda/non_uniform_algorithms.hpp | 20 +++++++++---------- .../oneapi/experimental/fixed_size_group.hpp | 9 ++++++--- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index df17a8e945d5d..be072531a7a14 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -20,6 +20,13 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +template struct is_fixed_size_group : std::false_type {}; + +template +inline constexpr bool is_fixed_size_group_v = is_fixed_size_group::value; +} // namespace detail + template class group; namespace ext::oneapi { struct sub_group; @@ -50,11 +57,6 @@ template inline constexpr bool is_user_constructed_group_v = is_user_constructed_group::value; -template struct is_fixed_size_group : std::false_type {}; - -template -inline constexpr bool is_fixed_size_group_v = is_fixed_size_group::value; - namespace detail { template struct is_group_helper : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index c149dcae16618..eea68d89a35fe 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -96,10 +96,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // fixed_size_group group reduction using shfls template -inline __SYCL_ALWAYS_INLINE - std::enable_if_t, T> - masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { T tmp; if constexpr (std::is_same_v) { @@ -139,7 +138,7 @@ template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v, T> non_uniform_shfl_T(const uint32_t MemberMask, T x, int shfl_param) { - if constexpr (ext::oneapi::experimental::is_fixed_size_group_v) { + if constexpr (is_fixed_size_group_v) { return __nvvm_shfl_sync_up_i32(MemberMask, x, shfl_param, 0); } else { return __nvvm_shfl_sync_idx_i32(MemberMask, x, shfl_param, 31); @@ -186,7 +185,7 @@ non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) { template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !ext::oneapi::experimental::is_fixed_size_group_v, + !is_fixed_size_group_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -284,10 +283,9 @@ GET_ID(IsMaximum, min) // fixed_size_group group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE - std::enable_if_t, T> - masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; for (int i = 1; i < g.get_local_range()[0]; i *= 2) { auto tmp = non_uniform_shfl(g, MemberMask, x, i); @@ -308,7 +306,7 @@ template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !ext::oneapi::experimental::is_fixed_size_group_v, + !is_fixed_size_group_v, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index d51a18c3f2725..cbc6be038f4ba 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -161,11 +161,14 @@ template struct is_user_constructed_group> : std::true_type {}; +} // namespace ext::oneapi::experimental + +namespace detail { template -struct is_fixed_size_group> +struct is_fixed_size_group< + ext::oneapi::experimental::fixed_size_group> : std::true_type {}; - -} // namespace ext::oneapi::experimental +} // namespace detail template struct is_group<