diff --git a/SYCL/NonUniformGroups/ballot_group.cpp b/SYCL/NonUniformGroups/ballot_group.cpp
new file mode 100644
index 0000000000..955744b390
--- /dev/null
+++ b/SYCL/NonUniformGroups/ballot_group.cpp
@@ -0,0 +1,58 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items
+          bool Predicate = item.get_global_id() % 2 == 0;
+          auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+
+          // Check function return values match Predicate
+          bool Match = true;
+          auto GroupID = (Predicate) ? 1 : 0;
+          Match &= (BallotGroup.get_group_id() == GroupID);
+          Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2);
+          Match &= (BallotGroup.get_group_range() == 2);
+          Match &= (BallotGroup.get_local_range() == 16);
+          MatchAcc[WI] = Match;
+          LeaderAcc[WI] = BallotGroup.leader();
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == (WI < 2));
+  }
+  return 0;
+}
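For context, a minimal usage sketch of the ballot-group API exercised above
(illustrative only, not part of the patch; assumes group algorithm support for
non-uniform groups as described in the extension specification):

    #include <sycl/sycl.hpp>
    namespace syclex = sycl::ext::oneapi::experimental;

    // Each of the two ballot groups performs an independent reduction.
    void partitioned_sum(sycl::nd_item<1> item, int Value, int *Sums) {
      auto SG = item.get_sub_group();
      bool Predicate = item.get_global_id() % 2 == 0;
      // Work-items with Predicate true form group 1; the rest form group 0.
      auto Ballot = syclex::get_ballot_group(SG, Predicate);
      int Sum = sycl::reduce_over_group(Ballot, Value, sycl::plus<>());
      if (Ballot.leader())
        Sums[Ballot.get_group_id()] = Sum;
    }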
diff --git a/SYCL/NonUniformGroups/cluster_group.cpp b/SYCL/NonUniformGroups/cluster_group.cpp
new file mode 100644
index 0000000000..e1d7634191
--- /dev/null
+++ b/SYCL/NonUniformGroups/cluster_group.cpp
@@ -0,0 +1,63 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+template <size_t ClusterSize> class TestKernel;
+
+template <size_t ClusterSize> void test() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          auto ClusterGroup = syclex::get_cluster_group<ClusterSize>(SG);
+
+          bool Match = true;
+          Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize));
+          Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize));
+          Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize));
+          Match &= (ClusterGroup.get_local_range() == ClusterSize);
+          MatchAcc[WI] = Match;
+          LeaderAcc[WI] = ClusterGroup.leader();
+        };
+    CGH.parallel_for<TestKernel<ClusterSize>>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0));
+  }
+}
+
+int main() {
+  test<1>();
+  test<2>();
+  test<4>();
+  test<8>();
+  test<16>();
+  test<32>();
+  return 0;
+}
diff --git a/SYCL/NonUniformGroups/is_fixed_topology.cpp b/SYCL/NonUniformGroups/is_fixed_topology.cpp
new file mode 100644
index 0000000000..b3b6cd5ba4
--- /dev/null
+++ b/SYCL/NonUniformGroups/is_fixed_topology.cpp
@@ -0,0 +1,12 @@
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <sycl/sycl.hpp>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
+static_assert(syclex::is_fixed_topology_group_v<syclex::root_group>);
+#endif
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<1>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<2>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<3>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::sub_group>);
diff --git a/SYCL/NonUniformGroups/is_user_constructed.cpp b/SYCL/NonUniformGroups/is_user_constructed.cpp
new file mode 100644
index 0000000000..a3f0085d8e
--- /dev/null
+++ b/SYCL/NonUniformGroups/is_user_constructed.cpp
@@ -0,0 +1,14 @@
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <sycl/sycl.hpp>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+static_assert(
+    syclex::is_user_constructed_group_v<syclex::ballot_group<sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<
+              syclex::cluster_group<1, sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<
+              syclex::cluster_group<2, sycl::sub_group>>);
+static_assert(
+    syclex::is_user_constructed_group_v<syclex::tangle_group<sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<syclex::opportunistic_group>);
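For context, a sketch of how the traits and the cluster-group API above might
be used together (illustrative only, not part of the patch):

    #include <sycl/sycl.hpp>
    #include <type_traits>
    namespace syclex = sycl::ext::oneapi::experimental;

    // Accept only user-constructed groups, as identified by the trait.
    template <typename Group>
    std::enable_if_t<syclex::is_user_constructed_group_v<Group>, bool>
    is_group_leader(Group G) {
      return G.leader();
    }

    void example(sycl::nd_item<1> item) {
      auto SG = item.get_sub_group();
      // Partition the sub-group into fixed-size clusters of four work-items.
      auto Cluster = syclex::get_cluster_group<4>(SG);
      bool Leader = is_group_leader(Cluster); // OK: user-constructed
      // is_group_leader(SG) would not compile: sub_group is fixed-topology.
      (void)Leader;
    }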
diff --git a/SYCL/NonUniformGroups/opportunistic_group.cpp b/SYCL/NonUniformGroups/opportunistic_group.cpp
new file mode 100644
index 0000000000..925340cee1
--- /dev/null
+++ b/SYCL/NonUniformGroups/opportunistic_group.cpp
@@ -0,0 +1,68 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Due to the unpredictable runtime behavior of opportunistic
+          // groups, some values may change from run to run. Check they're
+          // in expected ranges and consistent with other groups.
+          if (item.get_global_id() % 2 == 0) {
+            auto OpportunisticGroup =
+                syclex::this_kernel::get_opportunistic_group();
+
+            bool Match = true;
+            Match &= (OpportunisticGroup.get_group_id() == 0);
+            Match &= (OpportunisticGroup.get_local_id() <
+                      OpportunisticGroup.get_local_range());
+            Match &= (OpportunisticGroup.get_group_range() == 1);
+            Match &= (OpportunisticGroup.get_local_linear_range() <=
+                      SG.get_local_linear_range());
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = OpportunisticGroup.leader();
+          }
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  uint32_t NumLeaders = 0;
+  for (int WI = 0; WI < 32; ++WI) {
+    if (WI % 2 == 0) {
+      assert(MatchAcc[WI] == true);
+      if (LeaderAcc[WI]) {
+        NumLeaders++;
+      }
+    }
+  }
+  assert(NumLeaders > 0);
+  return 0;
+}
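For context, the canonical use of an opportunistic group is aggregating an
atomic update across whichever work-items happen to arrive together. A sketch
(illustrative only, not part of the patch; assumes group_broadcast supports
non-uniform groups as described in the extension specification):

    #include <sycl/sycl.hpp>
    namespace syclex = sycl::ext::oneapi::experimental;

    int aggregated_fetch_add(int *Counter) {
      auto Group = syclex::this_kernel::get_opportunistic_group();
      int Base = 0;
      if (Group.leader()) {
        sycl::atomic_ref<int, sycl::memory_order::relaxed,
                         sycl::memory_scope::device>
            Ref(*Counter);
        // One atomic operation on behalf of every arriving work-item.
        Base = Ref.fetch_add(static_cast<int>(Group.get_local_linear_range()));
      }
      // Share the leader's base offset, then add each member's rank.
      Base = sycl::group_broadcast(Group, Base);
      return Base + static_cast<int>(Group.get_local_linear_id());
    }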
diff --git a/SYCL/NonUniformGroups/tangle_group.cpp b/SYCL/NonUniformGroups/tangle_group.cpp
new file mode 100644
index 0000000000..172a73ebdc
--- /dev/null
+++ b/SYCL/NonUniformGroups/tangle_group.cpp
@@ -0,0 +1,69 @@
+// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items via control flow
+          // Branches deliberately duplicated to test impact of optimizations
+          // This only reliably works with optimizations disabled right now
+          if (item.get_global_id() % 2 == 0) {
+            auto TangleGroup = syclex::get_tangle_group(SG);
+
+            bool Match = true;
+            Match &= (TangleGroup.get_group_id() == 0);
+            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+            Match &= (TangleGroup.get_group_range() == 1);
+            Match &= (TangleGroup.get_local_range() == 16);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = TangleGroup.leader();
+          } else {
+            auto TangleGroup = syclex::get_tangle_group(SG);
+
+            bool Match = true;
+            Match &= (TangleGroup.get_group_id() == 0);
+            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+            Match &= (TangleGroup.get_group_range() == 1);
+            Match &= (TangleGroup.get_local_range() == 16);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = TangleGroup.leader();
+          }
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == (WI < 2));
+  }
+  return 0;
+}
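For context, a sketch of the tangle-group API exercised above (illustrative
only, not part of the patch; assumes group algorithm support for non-uniform
groups as described in the extension specification):

    #include <sycl/sycl.hpp>
    namespace syclex = sycl::ext::oneapi::experimental;

    int branch_local_sum(sycl::nd_item<1> item, int Value) {
      auto SG = item.get_sub_group();
      if (item.get_global_id() % 2 == 0) {
        // The tangle contains exactly the even work-items of this sub-group,
        // i.e. everyone that took this branch together.
        auto Tangle = syclex::get_tangle_group(SG);
        return sycl::reduce_over_group(Tangle, Value, sycl::plus<>());
      }
      return Value;
    }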