// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
//
// UNSUPPORTED: cuda || hip

#include <sycl/sycl.hpp>

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

namespace syclex = sycl::ext::oneapi::experimental;

class TestKernel;

int main() {
  sycl::queue Q;

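  // The test requires sub-group size 32; skip gracefully if the device does
  // not support it.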
  auto SGSizes =
      Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
    std::cout << "Test skipped due to missing support for sub-group size 32."
              << std::endl;
    return 0;
  }
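  // One result slot per work-item: whether the tangle group's properties
  // matched expectations, and whether the work-item was the tangle's leader.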
  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

  const auto NDR = sycl::nd_range<1>{32, 32};
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
    const auto KernelFunc =
        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
          auto WI = item.get_global_id();
          auto SG = item.get_sub_group();

          // Split into odd and even work-items via control flow
          // Branches deliberately duplicated to test impact of optimizations
          if (item.get_global_id() % 2 == 0) {
            auto TangleGroup = syclex::get_tangle_group(SG);

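            // Every work-item in this branch should observe the same tangle:
            // a single group (group id 0, group range 1) made up of the 16
            // work-items that took this branch, numbered consecutively.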
            bool Match = true;
            Match &= (TangleGroup.get_group_id() == 0);
            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
            Match &= (TangleGroup.get_group_range() == 1);
            Match &= (TangleGroup.get_local_range() == 16);
            MatchAcc[WI] = Match;
            LeaderAcc[WI] = TangleGroup.leader();
          } else {
            auto TangleGroup = syclex::get_tangle_group(SG);

            bool Match = true;
            Match &= (TangleGroup.get_group_id() == 0);
            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
            Match &= (TangleGroup.get_group_range() == 1);
            Match &= (TangleGroup.get_local_range() == 16);
            MatchAcc[WI] = Match;
            LeaderAcc[WI] = TangleGroup.leader();
          }
        };
    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
  });

  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
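  // Work-item 0 leads the even tangle and work-item 1 leads the odd tangle,
  // so only the first two work-items should report being a leader.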
  for (int WI = 0; WI < 32; ++WI) {
    assert(MatchAcc[WI] == true);
    assert(LeaderAcc[WI] == (WI < 2));
  }
  return 0;
}