From bb39b47e125b1ccee03e0ee9c6e92e40d6688945 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 2 May 2023 08:42:25 -0700
Subject: [PATCH 01/22] Basic cuda support for opportunistic_group,
 fixed_size_group, and ballot_group.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 libclc/ptx-nvidiacl/libspirv/SOURCES          |  2 +-
 .../{group_ballot.cl => group_non_uniform.cl} |  7 +++++
 .../ext/oneapi/experimental/ballot_group.hpp  |  4 +--
 .../oneapi/experimental/fixed_size_group.hpp  | 26 +++++++++++++++++++
 .../experimental/non_uniform_groups.hpp       |  7 ++++-
 .../experimental/opportunistic_group.hpp      | 12 ++++++---
 .../NonUniformGroups/ballot_group.cpp         |  2 +-
 .../NonUniformGroups/fixed_size_group.cpp     |  2 +-
 .../NonUniformGroups/opportunistic_group.cpp  |  2 +-
 9 files changed, 53 insertions(+), 11 deletions(-)
 rename libclc/ptx-nvidiacl/libspirv/group/{group_ballot.cl => group_non_uniform.cl} (80%)
diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES
index bec378d428511..4177aae12b416 100644
--- a/libclc/ptx-nvidiacl/libspirv/SOURCES
+++ b/libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -93,7 +93,7 @@ images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
 group/collectives.cl
-group/group_ballot.cl
+group/group_non_uniform.cl
 atomic/atomic_add.cl
 atomic/atomic_and.cl
 atomic/atomic_cmpxchg.cl
diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
similarity index 80%
rename from libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl
rename to libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
index 33285028b7b39..50826d9bf53e2 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl
+++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "membermask.h"
+#include <integer/popcount.h>
 
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
@@ -34,3 +35,9 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) {
 
   return res;
 }
+
+_CLC_DEF _CLC_CONVERGENT uint
+_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(
+    uint scope, uint flag, __clc_vec4_uint32_t mask) {
+  return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]);
+}
diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index fcdce42652075..f1022db9ebedb 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -121,8 +121,8 @@ template <typename ParentGroup> class ballot_group {
   friend ballot_group<ParentGroup>
   get_ballot_group<ParentGroup>(ParentGroup g, bool predicate);
 
-  friend uint32_t sycl::detail::IdToMaskPosition<ballot_group<ParentGroup>>(
-      ballot_group<ParentGroup> Group, uint32_t Id);
+  friend sub_group_mask sycl::detail::GetMask<ballot_group<ParentGroup>>(
+      ballot_group<ParentGroup> Group);
 };
 
 template <typename Group>
diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index 3c2a1b07b74d7..ea8c9e010ae04 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -112,10 +112,23 @@ template <size_t PartitionSize, typename ParentGroup> class fixed_size_group {
   }
 
 protected:
+
+#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  sub_group_mask Mask;
+#endif
+
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  fixed_size_group(ext::oneapi::sub_group_mask mask):Mask(mask) {}
+#else
   fixed_size_group() {}
+#endif
 
   friend fixed_size_group<PartitionSize, ParentGroup>
   get_fixed_size_group<PartitionSize, ParentGroup>(ParentGroup g);
+
+  friend sub_group_mask
+  sycl::detail::GetMask<fixed_size_group<PartitionSize, ParentGroup>>(
+      fixed_size_group<PartitionSize, ParentGroup> Group);
 };
 
 template <size_t PartitionSize, typename Group>
@@ -125,7 +138,20 @@ inline std::enable_if_t<sycl::is_group_v<std::decay_t<Group>> &&
 get_fixed_size_group(Group group) {
   (void)group;
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  uint32_t loc_id = group.get_local_linear_id();
+  uint32_t loc_size = group.get_local_linear_range();
+  uint32_t bits = PartitionSize == 32
+                      ? 0xffffffff
+                      : ((1 << PartitionSize) - 1)
+                            << ((loc_id / PartitionSize) * PartitionSize);
+
+  return fixed_size_group<PartitionSize, sycl::sub_group>(
+      sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
+          bits, loc_size));
+#else
   return fixed_size_group<PartitionSize, sycl::sub_group>();
+#endif
 #else
   throw runtime_error("Non-uniform groups are not supported on host device.",
                       PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 0c31f9ad2290f..5e7befde49a6e 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -39,10 +39,15 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
 }
 #endif
 
+template <typename NonUniformGroup>
+inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) {
+  return Group.Mask;
+}
+
 template <typename NonUniformGroup>
 inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
   // TODO: This will need to be optimized
-  sycl::vec<unsigned, 4> MemberMask = ExtractMask(Group.Mask);
+  sycl::vec<unsigned, 4> MemberMask = ExtractMask(GetMask(Group));
   uint32_t Count = 0;
   for (int i = 0; i < 4; ++i) {
     for (int b = 0; b < 32; ++b) {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index a1c08a35b399b..b51c8f02ab8dc 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -118,9 +118,8 @@ class opportunistic_group {
 
   friend opportunistic_group this_kernel::get_opportunistic_group();
 
-  friend uint32_t
-  sycl::detail::IdToMaskPosition<opportunistic_group>(opportunistic_group Group,
-                                                      uint32_t Id);
+  friend sub_group_mask
+  sycl::detail::GetMask<opportunistic_group>(opportunistic_group Group);
 };
 
 namespace this_kernel {
@@ -133,7 +132,12 @@ inline opportunistic_group get_opportunistic_group() {
   sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true);
   return opportunistic_group(mask);
 #elif defined(__NVPTX__)
-  // TODO: Construct from __activemask
+  uint32_t active_mask;
+  asm volatile("activemask.b32 %0;" : "=r"(active_mask));
+  sub_group_mask mask =
+    sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
+        active_mask, 32);
+  return opportunistic_group(mask);
 #endif
 #else
   throw runtime_error("Non-uniform groups are not supported on host device.",
diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
index 98fd7174208e5..86abbf0906224 100644
--- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
-// UNSUPPORTED: cpu || cuda || hip
+// UNSUPPORTED: cpu || hip
 
 #include <sycl/sycl.hpp>
 #include <vector>
diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
index b4f7e96531076..b3d2e21f7eb17 100644
--- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
-// UNSUPPORTED: cpu || cuda || hip
+// UNSUPPORTED: cpu || hip
 
 #include <sycl/sycl.hpp>
 #include <vector>
diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
index 925340cee1c6d..67b68ec31289b 100644
--- a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
-// UNSUPPORTED: cpu || cuda || hip
+// UNSUPPORTED: cpu || hip
 
 #include <sycl/sycl.hpp>
 #include <vector>

From 4ca058c1d206094282eb6e38bf854717d7ebd5d4 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 5 May 2023 12:25:54 -0700
Subject: [PATCH 02/22] Fix test failure, add comment in libclc.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl    | 2 ++
 sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp | 4 ++--
 sycl/include/sycl/ext/oneapi/sub_group_mask.hpp            | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
index 50826d9bf53e2..7e5f81850c352 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
+++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
@@ -39,5 +39,7 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) {
 _CLC_DEF _CLC_CONVERGENT uint
 _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(
     uint scope, uint flag, __clc_vec4_uint32_t mask) {
+  // here we assume scope == __spv::Scope::Subgroup && flag ==
+  // (int)__spv::GroupOperation::ExclusiveScan which is the only current use case
   return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]);
 }
diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
index cc19edc820f70..e3e8a077edf2d 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
@@ -119,8 +119,8 @@ template <typename ParentGroup> class tangle_group {
 
   friend tangle_group<ParentGroup> get_tangle_group<ParentGroup>(ParentGroup);
 
-  friend uint32_t sycl::detail::IdToMaskPosition<tangle_group<ParentGroup>>(
-      tangle_group<ParentGroup> Group, uint32_t Id);
+  friend sub_group_mask sycl::detail::GetMask<tangle_group<ParentGroup>>(
+      tangle_group<ParentGroup> Group);
 };
 
 template <typename Group>
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index af86bdafae9d6..90e7d85e9ef69 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -94,11 +94,15 @@ struct sub_group_mask {
   bool none() const { return count() == 0; }
   uint32_t count() const {
     unsigned int count = 0;
+    #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+    asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits));
+    #else
     auto word = (Bits & valuable_bits(bits_num));
     while (word) {
       word &= (word - 1);
       count++;
     }
+    #endif
     return count;
   }
   uint32_t size() const { return bits_num; }

From 369f25f48c9672973cf5016cd0e14a1bf640b4ff Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 9 May 2023 14:27:48 -0700
Subject: [PATCH 03/22] format

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../sycl/ext/oneapi/experimental/fixed_size_group.hpp       | 4 ++--
 .../sycl/ext/oneapi/experimental/opportunistic_group.hpp    | 4 ++--
 sycl/include/sycl/ext/oneapi/sub_group_mask.hpp             | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index ea8c9e010ae04..bd4de46945766 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -113,12 +113,12 @@ template <size_t PartitionSize, typename ParentGroup> class fixed_size_group {
 
 protected:
 
-#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
   sub_group_mask Mask;
 #endif
 
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  fixed_size_group(ext::oneapi::sub_group_mask mask):Mask(mask) {}
+  fixed_size_group(ext::oneapi::sub_group_mask mask) : Mask(mask) {}
 #else
   fixed_size_group() {}
 #endif
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index b51c8f02ab8dc..016aed88b555c 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -135,8 +135,8 @@ inline opportunistic_group get_opportunistic_group() {
   uint32_t active_mask;
   asm volatile("activemask.b32 %0;" : "=r"(active_mask));
   sub_group_mask mask =
-    sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
-        active_mask, 32);
+      sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
+          active_mask, 32);
   return opportunistic_group(mask);
 #endif
 #else
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index 90e7d85e9ef69..f1bfe5aec20c2 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -94,15 +94,15 @@ struct sub_group_mask {
   bool none() const { return count() == 0; }
   uint32_t count() const {
     unsigned int count = 0;
-    #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
     asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits));
-    #else
+#else
     auto word = (Bits & valuable_bits(bits_num));
     while (word) {
       word &= (word - 1);
       count++;
     }
-    #endif
+#endif
     return count;
   }
   uint32_t size() const { return bits_num; }

From f443f8170903517d1d4f954356038c0fb7670f09 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 9 May 2023 14:33:23 -0700
Subject: [PATCH 04/22] format

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index bd4de46945766..de45d5ba3bdef 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -112,7 +112,6 @@ template <size_t PartitionSize, typename ParentGroup> class fixed_size_group {
   }
 
 protected:
-
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
   sub_group_mask Mask;
 #endif

From 3069a1e8fee7de23554c69537a18e05afd067163 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 03:33:25 -0700
Subject: [PATCH 05/22] Optimized `IdToMaskPosition` NVPTX case.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../sycl/ext/oneapi/experimental/non_uniform_groups.hpp     | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 5e7befde49a6e..b099787f0fa25 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -46,8 +46,11 @@ inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) {
 
 template <typename NonUniformGroup>
 inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
-  // TODO: This will need to be optimized
   sycl::vec<unsigned, 4> MemberMask = ExtractMask(GetMask(Group));
+#if defined(__NVPTX__)
+  return __nvvm_fns(MemberMask[0], 0, Id + 1);
+#else
+  // TODO: This will need to be optimized
   uint32_t Count = 0;
   for (int i = 0; i < 4; ++i) {
     for (int b = 0; b < 32; ++b) {
@@ -60,6 +63,7 @@ inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
     }
   }
   return Count;
+#endif
 }
 
 } // namespace detail

From 0b1b82a4fb5052c8a034bced96b19ce38ac951a9 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 07:27:08 -0700
Subject: [PATCH 06/22] barrier, broadcast, any_of, all_of, none_of impls

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/spirv.hpp | 58 +++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 85cb4834552d4..0a65ba9e781ec 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -16,6 +16,7 @@
 #include <sycl/detail/type_list.hpp>
 #include <sycl/detail/type_traits.hpp>
 #include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp>
+#include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
 #include <sycl/id.hpp>
 #include <sycl/memory_enums.hpp>
 
@@ -129,6 +130,9 @@ template <typename Group> bool GroupAll(Group, bool pred) {
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
+#if defined(__NVPTX__)
+  return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
+#else
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
@@ -137,26 +141,35 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
   } else {
     return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
   }
+#endif
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAll(
-    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
     bool pred) {
+#if defined(__NVPTX__)
+  return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
+#else
   // GroupNonUniformAll doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseAnd(
       group_scope<ParentGroup>::value,
       static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
       static_cast<uint32_t>(pred), PartitionSize);
+#endif
 }
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
   return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
 }
-template <typename Group>
-bool GroupAll(const ext::oneapi::experimental::opportunistic_group &,
+
+bool GroupAll(const ext::oneapi::experimental::opportunistic_group &g,
               bool pred) {
+#if defined(__NVPTX__)
+  return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
+#else
   return __spirv_GroupNonUniformAll(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
+#endif
 }
 
 template <typename Group> bool GroupAny(Group, bool pred) {
@@ -165,6 +178,9 @@ template <typename Group> bool GroupAny(Group, bool pred) {
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
+#if defined (__NVPTX__)
+  return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
+#else
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
@@ -173,25 +189,34 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
   } else {
     return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
   }
+#endif
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAny(
-    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
     bool pred) {
+#if defined (__NVPTX__)
+  return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
+#else
   // GroupNonUniformAny doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseOr(
       group_scope<ParentGroup>::value,
       static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
       static_cast<uint32_t>(pred), PartitionSize);
+#endif
 }
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
   return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
 }
-bool GroupAny(const ext::oneapi::experimental::opportunistic_group &,
+bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g,
               bool pred) {
+#if defined (__NVPTX__)
+  return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
+#else
   return __spirv_GroupNonUniformAny(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
+#endif
 }
 
 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
@@ -269,7 +294,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
                T x, IdT local_id) {
   // Remap local_id to its original numbering in ParentGroup.
   auto LocalId = detail::IdToMaskPosition(g, local_id);
-
+#if defined(__NVPTX__)
+  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31);
+#else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -289,6 +316,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
                                             OCLX, OCLId);
   }
+#endif
 }
 template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
@@ -296,7 +324,9 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
     T x, IdT local_id) {
   // Remap local_id to its original numbering in ParentGroup
   auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
-
+#if defined(__NVPTX__)
+  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31);
+#else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -312,6 +342,7 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
   // partition, and it's unclear which will be faster in practice.
   return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
                                         OCLId);
+#endif
 }
 template <typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT>
@@ -338,7 +369,9 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
                IdT local_id) {
   // Remap local_id to its original numbering in sub-group
   auto LocalId = detail::IdToMaskPosition(g, local_id);
-
+#if defined(__NVPTX__)
+  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31);
+#else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<sycl::ext::oneapi::sub_group>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -351,6 +384,7 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
   return __spirv_GroupNonUniformBroadcast(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, OCLX,
       OCLId);
+#endif
 }
 
 template <typename Group, typename T, typename IdT>
@@ -1022,8 +1056,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
 template <typename Group>
 typename std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group>>
-ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
-#if defined(__SPIR__)
+ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
+#if defined(__NVPTX__)
+  __nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]);
+#else
   // SPIR-V does not define an instruction to synchronize partial groups.
   // However, most (possibly all?) of the current SPIR-V targets execute
   // work-items in lockstep, so we can probably get away with a MemoryBarrier.
@@ -1033,8 +1069,6 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
                             __spv::MemorySemanticsMask::SubgroupMemory |
                             __spv::MemorySemanticsMask::WorkgroupMemory |
                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
-#elif defined(__NVPTX__)
-  // TODO: Call syncwarp with appropriate mask extracted from the group
 #endif
 }
 

From 5a9668b3ed3f572208392fe75f41b519e23a35fd Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 07:41:53 -0700
Subject: [PATCH 07/22] reduce/scan impls.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../cuda/non_uniform_algorithms.hpp           | 383 ++++++++++++++++++
 1 file changed, 383 insertions(+)
 create mode 100644 sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
new file mode 100644
index 0000000000000..6a072b1f21ece
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -0,0 +1,383 @@
+//==----- non_uniform_algorithms.hpp - cuda masked subgroup algorithms -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <sycl/detail/type_traits.hpp>
+#include <sycl/group.hpp>
+#include <sycl/detail/spirv.hpp>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+  namespace detail {
+
+template <typename T, class BinaryOperation>
+using IsRedux =
+    std::bool_constant<std::is_integral<T>::value &&
+                           sycl::detail::IsBitAND<T, BinaryOperation>::value ||
+                       sycl::detail::IsBitOR<T, BinaryOperation>::value ||
+                       sycl::detail::IsBitXOR<T, BinaryOperation>::value ||
+                       sycl::detail::IsPlus<T, BinaryOperation>::value ||
+                       sycl::detail::IsMinimum<T, BinaryOperation>::value ||
+                       sycl::detail::IsMaximum<T, BinaryOperation>::value>;
+
+#ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+
+//// Masked reductions using redux.sync, requires integer types
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sugeninteger<T>::value &&
+                     sycl::detail::IsMinimum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_umin(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sigeninteger<T>::value &&
+                     sycl::detail::IsMinimum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_min(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sugeninteger<T>::value &&
+                     sycl::detail::IsMaximum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_umax(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sigeninteger<T>::value &&
+                     sycl::detail::IsMaximum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_max(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsPlus<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_add(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsBitAND<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_and(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsBitOR<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_or(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsBitXOR<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_xor(x, MemberMask);
+}
+////
+
+//// Shuffle based masked reduction impls
+
+// Cluster group reduction using shfls, T = double
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
+        std::is_same_v<T, double>,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+    auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
+    auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
+    double tmp;
+    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                 : "=l"(tmp)
+                 : "r"(tmp_a), "r"(tmp_b));
+    x = binary_op(x, tmp);
+  }
+
+  return x;
+}
+
+// Cluster group reduction using shfls, T = float
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
+        std::is_same_v<T, float>,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+
+  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
+    auto tmp =
+        __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i);
+    x = binary_op(x, __nvvm_bitcast_i2f(tmp));
+  }
+  return x;
+}
+
+// Cluster group reduction using shfls, std::is_integral_v<T>
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
+        std::is_integral_v<T>,
+    T>
+masked_reduction_cuda_shfls(
+    Group g, T x, BinaryOperation binary_op,
+    const uint32_t MemberMask) {
+
+  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
+    auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
+    x = binary_op(x, tmp);
+  }
+  return x;
+}
+
+template <typename Group, typename T>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
+non_uniform_shfl_T(const uint32_t MemberMask, T x, int delta) {
+  if constexpr (ext::oneapi::experimental::is_fixed_size_group_v<Group>) {
+    return __nvvm_shfl_sync_up_i32(MemberMask, x, delta, 0);
+  } else {
+    return __nvvm_shfl_sync_idx_i32(MemberMask, x, delta, 31);
+  }
+}
+
+template <typename Group, typename T>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
+non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int delta) {
+  T res;
+  if constexpr (std::is_same_v<T, double>) {
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+    auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, delta);
+    auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, delta);
+    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                 : "=l"(res)
+                 : "r"(tmp_a), "r"(tmp_b));
+  } else {
+    auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
+    auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, input, delta);
+    res = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
+  }
+  return res;
+}
+
+// Opportunistic/Ballot group reduction using shfls
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
+        !ext::oneapi::experimental::is_fixed_size_group_v<Group>,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+  if (MemberMask == 0xffffffff) {
+    for (int i = 16; i > 0; i /= 2) {
+      auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
+      x = binary_op(x, tmp);
+    }
+    return x;
+  }
+
+  unsigned localSetBit = g.get_local_id()[0] + 1;
+
+  // number of elements requiring binary operations each loop iteration
+  auto opRange = g.get_local_range()[0];
+
+  // remainder that won't have a binary partner each loop iteration
+  int remainder;
+
+  while (opRange / 2 >= 1) {
+    remainder = opRange % 2;
+
+    // stride between local_ids forming a binary op
+    int stride = opRange / 2;
+
+    // unfolded position of set bit in mask of shfl src lane
+    int unfoldedSrcSetBit = localSetBit + stride;
+
+    // __nvvm_fns automatically wraps around to the correct bit position.
+    // There is no performance impact on src_set_bit position wrt localSetBit
+    auto tmp = non_uniform_shfl(
+        g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
+
+    if (!(localSetBit == 1 && remainder != 0)) {
+      x = binary_op(x, tmp);
+    }
+
+    opRange = std::ceil((float)opRange / 2.0f);
+  }
+  int broadID;
+  int maskRev;
+  asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask));
+  asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev));
+
+  return non_uniform_shfl(g, MemberMask, x, broadID);
+}
+
+// Non Redux types must fall back to shfl based implementations.
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<
+    std::is_same<IsRedux<T, BinaryOperation>, std::false_type>::value &&
+        ext::oneapi::experimental::is_user_constructed_group_v<Group>,
+    T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask);
+}
+
+// get_identity is only currently used in this cuda specific header. If in the future it has more general use is should be moved to a more appropriate header.
+template <typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<sycl::detail::IsPlus<T, BinaryOperation>::value ||
+                         sycl::detail::IsBitOR<T, BinaryOperation>::value ||
+                         sycl::detail::IsBitXOR<T, BinaryOperation>::value,
+                     T>
+    get_identity() {
+  return 0;
+}
+
+template <typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<sycl::detail::IsMultiplies<T, BinaryOperation>::value, T>
+    get_identity() {
+  return 1;
+}
+
+template <typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<sycl::detail::IsBitAND<T, BinaryOperation>::value, T>
+    get_identity() {
+  return ~0;
+}
+
+#define GET_ID(OP_CHECK, OP)                                                   \
+  template <typename T, class BinaryOperation>                                 \
+  inline __SYCL_ALWAYS_INLINE                                                  \
+      std::enable_if_t<sycl::detail::OP_CHECK<T, BinaryOperation>::value, T>   \
+      get_identity() {                                                         \
+    if constexpr (std::is_same_v<T, char>) {                                   \
+      return std::numeric_limits<char>::OP();                                  \
+    } else if constexpr (std::is_same_v<T, unsigned char>) {                   \
+      return std::numeric_limits<unsigned char>::OP();                         \
+    } else if constexpr (std::is_same_v<T, short>) {                           \
+      return std::numeric_limits<short>::OP();                                 \
+    } else if constexpr (std::is_same_v<T, unsigned short>) {                  \
+      return std::numeric_limits<unsigned short>::OP();                        \
+    } else if constexpr (std::is_same_v<T, int>) {                             \
+      return std::numeric_limits<int>::OP();                                   \
+    } else if constexpr (std::is_same_v<T, unsigned int>) {                    \
+      return std::numeric_limits<unsigned int>::OP();                          \
+    } else if constexpr (std::is_same_v<T, long>) {                            \
+      return std::numeric_limits<int>::OP();                                   \
+    } else if constexpr (std::is_same_v<T, unsigned long>) {                   \
+      return std::numeric_limits<unsigned int>::OP();                          \
+    } else if constexpr (std::is_same_v<T, float>) {                           \
+      return std::numeric_limits<float>::OP();                                 \
+    } else if constexpr (std::is_same_v<T, double>) {                          \
+      return std::numeric_limits<double>::OP();                                \
+    }                                                                          \
+    return 0;                                                                  \
+  }
+
+GET_ID(IsMinimum, max)
+GET_ID(IsMaximum, min)
+
+#undef GET_ID
+
+//// Shuffle based masked reduction impls
+
+// Cluster group scan using shfls
+template <__spv::GroupOperation Op, typename Group, typename T,
+          class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
+masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                       const uint32_t MemberMask) {
+  for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
+    auto tmp =
+        non_uniform_shfl(g, MemberMask, x, i);
+    if (g.get_local_id()[0] >= i)
+      x = binary_op(x, tmp);
+  }
+  if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
+
+    x = non_uniform_shfl(g, MemberMask, x, 1);
+    if (g.get_local_id()[0] == 0) {
+      return get_identity<T, BinaryOperation>();
+    }
+  }
+  return x;
+}
+
+template <__spv::GroupOperation Op, typename Group, typename T,
+          class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
+        !ext::oneapi::experimental::is_fixed_size_group_v<Group>,
+    T>
+masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                       const uint32_t MemberMask) {
+  int localIdVal = g.get_local_id()[0];
+  int localSetBit = localIdVal + 1;
+
+  for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
+    int unfoldedSrcSetBit = localSetBit - i;
+
+    auto tmp = non_uniform_shfl(
+        g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
+    if (localIdVal >= i)
+      x = binary_op(x, tmp);
+  }
+  if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
+    x = non_uniform_shfl(g, MemberMask, x,
+                         __nvvm_fns(MemberMask, 0, localSetBit - 1));
+    if (localIdVal == 0) {
+      return get_identity<T, BinaryOperation>();
+    }
+  }
+  return x;
+}
+
+#endif
+#endif
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl

From 4188a17e21a43619c15db177e4687bdd53fe632a Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 07:44:12 -0700
Subject: [PATCH 08/22] is_fixed_size_group check for reduce/scan branch impls

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/type_traits.hpp                    | 6 ++++++
 .../sycl/ext/oneapi/experimental/fixed_size_group.hpp       | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index b6613ea080c03..ac1853e68ac6d 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -50,6 +50,12 @@ template <class T>
 inline constexpr bool is_user_constructed_group_v =
     is_user_constructed_group<T>::value;
 
+template <class T> struct is_fixed_size_group : std::false_type {};
+
+template <class T>
+inline constexpr bool is_fixed_size_group_v =
+    is_fixed_size_group<T>::value;
+
 namespace detail {
 template <typename T> struct is_group_helper : std::false_type {};
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index de45d5ba3bdef..d51a18c3f2725 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -161,6 +161,10 @@ template <size_t PartitionSize, typename ParentGroup>
 struct is_user_constructed_group<fixed_size_group<PartitionSize, ParentGroup>>
     : std::true_type {};
 
+template <size_t PartitionSize, typename ParentGroup>
+struct is_fixed_size_group<fixed_size_group<PartitionSize, ParentGroup>>
+    : std::true_type {};
+
 } // namespace ext::oneapi::experimental
 
 template <size_t PartitionSize, typename ParentGroup>

From cf3d4e774a9303626cd26f27ebc4ae22c5b6dff2 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 07:47:33 -0700
Subject: [PATCH 09/22] cuda reduce/scans use non_uniform_algorithms.hpp

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/group_algorithm.hpp | 38 +++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 424a7800d7bb7..0d036aa4bc18e 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -22,6 +22,7 @@
 #include <sycl/known_identity.hpp>
 #include <sycl/nd_item.hpp>
 #include <sycl/sub_group.hpp>
+#include <sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
@@ -204,8 +205,23 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) {
            std::is_same_v<decltype(binary_op(x, x)), float>),
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask =
+      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+#if (__SYCL_CUDA_ARCH__ >= 800)
+    return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]);
+#else
+    return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]);
+#endif
+  } else {
+    return sycl::detail::calc<__spv::GroupOperation::Reduce>(
+        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  }
+#else
   return sycl::detail::calc<__spv::GroupOperation::Reduce>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -633,8 +649,19 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
                      std::is_same_v<decltype(binary_op(x, x)), float>),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask =
+      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(g, x, binary_op, MemberMask[0]);
+  } else {
+    return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
+        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  }
+#else
   return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -862,8 +889,19 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
                      std::is_same_v<decltype(binary_op(x, x)), float>),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask =
+      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(g, x, binary_op, MemberMask[0]);
+  } else {
+    return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
+        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  }
+#else
   return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",

From cf55f581e45d29c35531ab61b67a9132e24776c4 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 07:57:27 -0700
Subject: [PATCH 10/22] Enabled cuda in algorithm tests.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp      | 2 +-
 sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp  | 2 +-
 .../NonUniformGroups/opportunistic_group_algorithms.cpp         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
index da27e89bc2458..03cb9e5ba6a7f 100644
--- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
@@ -2,7 +2,7 @@
 // RUN: %{run} %t.out
 //
 // REQUIRES: gpu
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 #include <sycl/sycl.hpp>
 #include <vector>
diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
index a338a6cd6f98a..d2ed6e97baf65 100644
--- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
@@ -2,7 +2,7 @@
 // RUN: %{run} %t.out
 //
 // REQUIRES: gpu
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 #include <sycl/sycl.hpp>
 #include <vector>
diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
index ef364bf243425..93636a8156167 100644
--- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp
@@ -2,7 +2,7 @@
 // RUN: %{run} %t.out
 //
 // REQUIRES: gpu
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 
 #include <sycl/sycl.hpp>
 #include <vector>

From c045fc5fb445ce82f199877ac73bbe92023c7201 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 19 May 2023 08:08:41 -0700
Subject: [PATCH 11/22] Added missing volatile.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index 6a072b1f21ece..763bb426ac033 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -246,8 +246,8 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
   }
   int broadID;
   int maskRev;
-  asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask));
-  asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev));
+  asm volatile("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask));
+  asm volatile("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev));
 
   return non_uniform_shfl(g, MemberMask, x, broadID);
 }

From 36e59bbb1193c61d1a259f6afd7b7297ecd4fa10 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Wed, 31 May 2023 14:22:01 +0100
Subject: [PATCH 12/22] Format and fixed sycl branch merge.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../cuda/non_uniform_algorithms.hpp           | 66 +++++++++----------
 .../sycl/ext/oneapi/sub_group_mask.hpp        |  4 --
 sycl/include/sycl/group_algorithm.hpp         | 34 ++++------
 3 files changed, 43 insertions(+), 61 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index 763bb426ac033..fb0b248ffc2f3 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -25,8 +25,7 @@ using IsRedux =
                        sycl::detail::IsMinimum<T, BinaryOperation>::value ||
                        sycl::detail::IsMaximum<T, BinaryOperation>::value>;
 
-#ifdef __SYCL_DEVICE_ONLY__
-#if defined(__NVPTX__)
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
 
 //// Masked reductions using redux.sync, requires integer types
 
@@ -111,12 +110,12 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 
 // Cluster group reduction using shfls, T = double
 template <typename Group, typename T, class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
-        std::is_same_v<T, double>,
-    T>
-masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                            const uint32_t MemberMask) {
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
+                         std::is_same_v<T, double>,
+                     T>
+    masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                                const uint32_t MemberMask) {
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
     int x_a, x_b;
     asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
@@ -135,12 +134,12 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
 
 // Cluster group reduction using shfls, T = float
 template <typename Group, typename T, class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
-        std::is_same_v<T, float>,
-    T>
-masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                            const uint32_t MemberMask) {
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
+                         std::is_same_v<T, float>,
+                     T>
+    masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                                const uint32_t MemberMask) {
 
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
     auto tmp =
@@ -152,13 +151,12 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
 
 // Cluster group reduction using shfls, std::is_integral_v<T>
 template <typename Group, typename T, class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
-        std::is_integral_v<T>,
-    T>
-masked_reduction_cuda_shfls(
-    Group g, T x, BinaryOperation binary_op,
-    const uint32_t MemberMask) {
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
+                         std::is_integral_v<T>,
+                     T>
+    masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                                const uint32_t MemberMask) {
 
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
     auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
@@ -235,8 +233,8 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
 
     // __nvvm_fns automatically wraps around to the correct bit position.
     // There is no performance impact on src_set_bit position wrt localSetBit
-    auto tmp = non_uniform_shfl(
-        g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
+    auto tmp = non_uniform_shfl(g, MemberMask, x,
+                                __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
 
     if (!(localSetBit == 1 && remainder != 0)) {
       x = binary_op(x, tmp);
@@ -263,7 +261,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
   return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask);
 }
 
-// get_identity is only currently used in this cuda specific header. If in the future it has more general use is should be moved to a more appropriate header.
+// get_identity is only currently used in this cuda specific header. If in the
+// future it has more general use it should be moved to a more appropriate
+// header.
 template <typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<sycl::detail::IsPlus<T, BinaryOperation>::value ||
@@ -327,13 +327,12 @@ GET_ID(IsMaximum, min)
 // Cluster group scan using shfls
 template <__spv::GroupOperation Op, typename Group, typename T,
           class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
-masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                       const uint32_t MemberMask) {
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
+    masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
   for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
-    auto tmp =
-        non_uniform_shfl(g, MemberMask, x, i);
+    auto tmp = non_uniform_shfl(g, MemberMask, x, i);
     if (g.get_local_id()[0] >= i)
       x = binary_op(x, tmp);
   }
@@ -361,8 +360,8 @@ masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
   for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
     int unfoldedSrcSetBit = localSetBit - i;
 
-    auto tmp = non_uniform_shfl(
-        g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
+    auto tmp = non_uniform_shfl(g, MemberMask, x,
+                                __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
     if (localIdVal >= i)
       x = binary_op(x, tmp);
   }
@@ -376,8 +375,7 @@ masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
   return x;
 }
 
-#endif
-#endif
+#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index cad2c8daf4242..d6365e2ca88f5 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -106,15 +106,11 @@ struct sub_group_mask {
         __spv::Scope::Subgroup, (int)__spv::GroupOperation::Reduce, OCLMask);
 #else
     unsigned int count = 0;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-    asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits));
-#else
     auto word = (Bits & valuable_bits(bits_num));
     while (word) {
       word &= (word - 1);
       count++;
     }
-#endif
     return count;
 #endif
   }
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 8a3284b3aa029..cdcb8145b190c 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -209,22 +209,18 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) {
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
 #if defined(__NVPTX__)
-  sycl::vec<unsigned, 4> MemberMask =
-      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    sycl::vec<unsigned, 4> MemberMask =
+        sycl::detail::ExtractMask(sycl::detail::GetMask(g));
 #if (__SYCL_CUDA_ARCH__ >= 800)
     return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]);
 #else
     return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]);
 #endif
-  } else {
-    return sycl::detail::calc<__spv::GroupOperation::Reduce>(
-        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
   }
-#else
+#endif
   return sycl::detail::calc<__spv::GroupOperation::Reduce>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
-#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -653,18 +649,14 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
 #if defined(__NVPTX__)
-  sycl::vec<unsigned, 4> MemberMask =
-      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
-    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(g, x, binary_op, MemberMask[0]);
-  } else {
-    return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
-        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(
+        g, x, binary_op,
+        sycl::detail::ExtractMask(sycl::detail::GetMask(g))[0]);
   }
-#else
+#endif
   return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
-#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -893,18 +885,14 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
 #if defined(__NVPTX__)
-  sycl::vec<unsigned, 4> MemberMask =
-      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
-    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(g, x, binary_op, MemberMask[0]);
-  } else {
-    return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
-        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(
+        g, x, binary_op,
+        sycl::detail::ExtractMask(sycl::detail::GetMask(g))[0]);
   }
-#else
+#endif
   return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
-#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",

From f3c8665ab62d7c7db804ac7b757a5e3526ea13f5 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Wed, 31 May 2023 14:49:38 +0100
Subject: [PATCH 13/22] Format.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/spirv.hpp                | 15 +++++++++------
 .../experimental/cuda/non_uniform_algorithms.hpp  |  4 ++--
 sycl/include/sycl/group_algorithm.hpp             |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 0a65ba9e781ec..e6804ab589426 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -178,7 +178,7 @@ template <typename Group> bool GroupAny(Group, bool pred) {
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
-#if defined (__NVPTX__)
+#if defined(__NVPTX__)
   return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
 #else
   // ballot_group partitions its parent into two groups (0 and 1)
@@ -195,7 +195,7 @@ template <size_t PartitionSize, typename ParentGroup>
 bool GroupAny(
     ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
     bool pred) {
-#if defined (__NVPTX__)
+#if defined(__NVPTX__)
   return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
 #else
   // GroupNonUniformAny doesn't support cluster size, so use a reduction
@@ -211,7 +211,7 @@ bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
 }
 bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g,
               bool pred) {
-#if defined (__NVPTX__)
+#if defined(__NVPTX__)
   return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
 #else
   return __spirv_GroupNonUniformAny(
@@ -295,7 +295,8 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
   // Remap local_id to its original numbering in ParentGroup.
   auto LocalId = detail::IdToMaskPosition(g, local_id);
 #if defined(__NVPTX__)
-  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31);
+  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
+                                  LocalId, 31);
 #else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
@@ -325,7 +326,8 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
   // Remap local_id to its original numbering in ParentGroup
   auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
 #if defined(__NVPTX__)
-  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31);
+  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
+                                  LocalId, 31);
 #else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
@@ -370,7 +372,8 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
   // Remap local_id to its original numbering in sub-group
   auto LocalId = detail::IdToMaskPosition(g, local_id);
 #if defined(__NVPTX__)
-  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x, LocalId, 31);
+  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
+                                  LocalId, 31);
 #else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<sycl::ext::oneapi::sub_group>::type;
diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index fb0b248ffc2f3..45799002f4372 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
+#include <sycl/detail/spirv.hpp>
 #include <sycl/detail/type_traits.hpp>
 #include <sycl/group.hpp>
-#include <sycl/detail/spirv.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
-  namespace detail {
+namespace detail {
 
 template <typename T, class BinaryOperation>
 using IsRedux =
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index cdcb8145b190c..00c9d16263d1b 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -15,6 +15,7 @@
 #include <sycl/builtins.hpp>
 #include <sycl/detail/spirv.hpp>
 #include <sycl/detail/type_traits.hpp>
+#include <sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp>
 #include <sycl/ext/oneapi/functional.hpp>
 #include <sycl/functional.hpp>
 #include <sycl/group.hpp>
@@ -22,7 +23,6 @@
 #include <sycl/known_identity.hpp>
 #include <sycl/nd_item.hpp>
 #include <sycl/sub_group.hpp>
-#include <sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {

From 57c0bd9ecaac5f72ccd5b6c2b97934df4e887a3a Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Wed, 31 May 2023 14:53:38 +0100
Subject: [PATCH 14/22] Format.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/type_traits.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index ac1853e68ac6d..df17a8e945d5d 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -53,8 +53,7 @@ inline constexpr bool is_user_constructed_group_v =
 template <class T> struct is_fixed_size_group : std::false_type {};
 
 template <class T>
-inline constexpr bool is_fixed_size_group_v =
-    is_fixed_size_group<T>::value;
+inline constexpr bool is_fixed_size_group_v = is_fixed_size_group<T>::value;
 
 namespace detail {
 template <typename T> struct is_group_helper : std::false_type {};

From 48785863f98f6ef230db2ea56df3300e5fcf3a2f Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Wed, 31 May 2023 15:01:10 +0100
Subject: [PATCH 15/22] Make Is_Redux nvptx only.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp    | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index 45799002f4372..932b4d88ce894 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -14,6 +14,7 @@
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
 
 template <typename T, class BinaryOperation>
 using IsRedux =
@@ -25,8 +26,6 @@ using IsRedux =
                        sycl::detail::IsMinimum<T, BinaryOperation>::value ||
                        sycl::detail::IsMaximum<T, BinaryOperation>::value>;
 
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-
 //// Masked reductions using redux.sync, requires integer types
 
 template <typename Group, typename T, class BinaryOperation>

From 7aa585f734addb64a96d2bdb4ee83a8a5b64391d Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Wed, 31 May 2023 15:16:42 +0100
Subject: [PATCH 16/22] Added known_identity.hpp include.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../cuda/non_uniform_algorithms.hpp           | 66 ++++++++-----------
 1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index 932b4d88ce894..64bf5b13c4eec 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -10,6 +10,7 @@
 #include <sycl/detail/spirv.hpp>
 #include <sycl/detail/type_traits.hpp>
 #include <sycl/group.hpp>
+#include <sycl/known_identity.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
@@ -17,57 +18,49 @@ namespace detail {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
 
 template <typename T, class BinaryOperation>
-using IsRedux =
-    std::bool_constant<std::is_integral<T>::value &&
-                           sycl::detail::IsBitAND<T, BinaryOperation>::value ||
-                       sycl::detail::IsBitOR<T, BinaryOperation>::value ||
-                       sycl::detail::IsBitXOR<T, BinaryOperation>::value ||
-                       sycl::detail::IsPlus<T, BinaryOperation>::value ||
-                       sycl::detail::IsMinimum<T, BinaryOperation>::value ||
-                       sycl::detail::IsMaximum<T, BinaryOperation>::value>;
+using IsRedux = std::bool_constant<
+    std::is_integral<T>::value && IsBitAND<T, BinaryOperation>::value ||
+    IsBitOR<T, BinaryOperation>::value || IsBitXOR<T, BinaryOperation>::value ||
+    IsPlus<T, BinaryOperation>::value || IsMinimum<T, BinaryOperation>::value ||
+    IsMaximum<T, BinaryOperation>::value>;
 
 //// Masked reductions using redux.sync, requires integer types
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<sycl::detail::is_sugeninteger<T>::value &&
-                     sycl::detail::IsMinimum<T, BinaryOperation>::value,
-                 T>
+std::enable_if_t<
+    is_sugeninteger<T>::value && IsMinimum<T, BinaryOperation>::value, T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
   return __nvvm_redux_sync_umin(x, MemberMask);
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<sycl::detail::is_sigeninteger<T>::value &&
-                     sycl::detail::IsMinimum<T, BinaryOperation>::value,
-                 T>
+std::enable_if_t<
+    is_sigeninteger<T>::value && IsMinimum<T, BinaryOperation>::value, T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
   return __nvvm_redux_sync_min(x, MemberMask);
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<sycl::detail::is_sugeninteger<T>::value &&
-                     sycl::detail::IsMaximum<T, BinaryOperation>::value,
-                 T>
+std::enable_if_t<
+    is_sugeninteger<T>::value && IsMaximum<T, BinaryOperation>::value, T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
   return __nvvm_redux_sync_umax(x, MemberMask);
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<sycl::detail::is_sigeninteger<T>::value &&
-                     sycl::detail::IsMaximum<T, BinaryOperation>::value,
-                 T>
+std::enable_if_t<
+    is_sigeninteger<T>::value && IsMaximum<T, BinaryOperation>::value, T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
   return __nvvm_redux_sync_max(x, MemberMask);
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
-                  sycl::detail::is_sigeninteger<T>::value) &&
-                     sycl::detail::IsPlus<T, BinaryOperation>::value,
+std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
+                     IsPlus<T, BinaryOperation>::value,
                  T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
@@ -75,9 +68,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
-                  sycl::detail::is_sigeninteger<T>::value) &&
-                     sycl::detail::IsBitAND<T, BinaryOperation>::value,
+std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
+                     IsBitAND<T, BinaryOperation>::value,
                  T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
@@ -85,9 +77,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
-                  sycl::detail::is_sigeninteger<T>::value) &&
-                     sycl::detail::IsBitOR<T, BinaryOperation>::value,
+std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
+                     IsBitOR<T, BinaryOperation>::value,
                  T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
@@ -95,9 +86,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 }
 
 template <typename Group, typename T, class BinaryOperation>
-std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
-                  sycl::detail::is_sigeninteger<T>::value) &&
-                     sycl::detail::IsBitXOR<T, BinaryOperation>::value,
+std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
+                     IsBitXOR<T, BinaryOperation>::value,
                  T>
 masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
@@ -265,9 +255,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 // header.
 template <typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<sycl::detail::IsPlus<T, BinaryOperation>::value ||
-                         sycl::detail::IsBitOR<T, BinaryOperation>::value ||
-                         sycl::detail::IsBitXOR<T, BinaryOperation>::value,
+    std::enable_if_t<IsPlus<T, BinaryOperation>::value ||
+                         IsBitOR<T, BinaryOperation>::value ||
+                         IsBitXOR<T, BinaryOperation>::value,
                      T>
     get_identity() {
   return 0;
@@ -275,14 +265,14 @@ inline __SYCL_ALWAYS_INLINE
 
 template <typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<sycl::detail::IsMultiplies<T, BinaryOperation>::value, T>
+    std::enable_if_t<IsMultiplies<T, BinaryOperation>::value, T>
     get_identity() {
   return 1;
 }
 
 template <typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<sycl::detail::IsBitAND<T, BinaryOperation>::value, T>
+    std::enable_if_t<IsBitAND<T, BinaryOperation>::value, T>
     get_identity() {
   return ~0;
 }
@@ -290,7 +280,7 @@ inline __SYCL_ALWAYS_INLINE
 #define GET_ID(OP_CHECK, OP)                                                   \
   template <typename T, class BinaryOperation>                                 \
   inline __SYCL_ALWAYS_INLINE                                                  \
-      std::enable_if_t<sycl::detail::OP_CHECK<T, BinaryOperation>::value, T>   \
+      std::enable_if_t<OP_CHECK<T, BinaryOperation>::value, T>                 \
       get_identity() {                                                         \
     if constexpr (std::is_same_v<T, char>) {                                   \
       return std::numeric_limits<char>::OP();                                  \

From a40e410914c763b99a9ba0fcd632b04f092704ce Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 1 Jun 2023 03:32:42 -0700
Subject: [PATCH 17/22] Addressed review comments.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../cuda/non_uniform_algorithms.hpp           | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index 64bf5b13c4eec..0911d5d1faab9 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -97,7 +97,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 
 //// Shuffle based masked reduction impls
 
-// Cluster group reduction using shfls, T = double
+// fixed_size_group group reduction using shfls, T = double
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
@@ -121,7 +121,7 @@ inline __SYCL_ALWAYS_INLINE
   return x;
 }
 
-// Cluster group reduction using shfls, T = float
+// fixed_size_group group reduction using shfls, T = float
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
@@ -138,7 +138,7 @@ inline __SYCL_ALWAYS_INLINE
   return x;
 }
 
-// Cluster group reduction using shfls, std::is_integral_v<T>
+// fixed_size_group group reduction using shfls, std::is_integral_v<T>
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
@@ -157,31 +157,31 @@ inline __SYCL_ALWAYS_INLINE
 template <typename Group, typename T>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
-non_uniform_shfl_T(const uint32_t MemberMask, T x, int delta) {
+non_uniform_shfl_T(const uint32_t MemberMask, T x, int shfl_param) {
   if constexpr (ext::oneapi::experimental::is_fixed_size_group_v<Group>) {
-    return __nvvm_shfl_sync_up_i32(MemberMask, x, delta, 0);
+    return __nvvm_shfl_sync_up_i32(MemberMask, x, shfl_param, 0);
   } else {
-    return __nvvm_shfl_sync_idx_i32(MemberMask, x, delta, 31);
+    return __nvvm_shfl_sync_idx_i32(MemberMask, x, shfl_param, 31);
   }
 }
 
 template <typename Group, typename T>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
-non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int delta) {
+non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) {
   T res;
   if constexpr (std::is_same_v<T, double>) {
     int x_a, x_b;
     asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
 
-    auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, delta);
-    auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, delta);
+    auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, shfl_param);
+    auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, shfl_param);
     asm volatile("mov.b64 %0,{%1,%2}; \n\t"
                  : "=l"(res)
                  : "r"(tmp_a), "r"(tmp_b));
   } else {
     auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
-    auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, input, delta);
+    auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, input, shfl_param);
     res = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
   }
   return res;
@@ -313,7 +313,7 @@ GET_ID(IsMaximum, min)
 
 //// Shuffle based masked reduction impls
 
-// Cluster group scan using shfls
+// fixed_size_group group scan using shfls
 template <__spv::GroupOperation Op, typename Group, typename T,
           class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE

From f20c936ac9f72651bc1cd603be3d7aa021d80177 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Thu, 1 Jun 2023 17:52:33 +0100
Subject: [PATCH 18/22] Removed unrequired includes.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp    | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index 0911d5d1faab9..c03eef3d1605a 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -7,9 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <sycl/detail/spirv.hpp>
-#include <sycl/detail/type_traits.hpp>
-#include <sycl/group.hpp>
 #include <sycl/known_identity.hpp>
 
 namespace sycl {

From dcffcbeb664beac32999e5f421e93107a490d14e Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Thu, 1 Jun 2023 21:52:11 +0100
Subject: [PATCH 19/22] Removed breaking opportunistic_group include.

forward decl of opportunistic_group is failing in spirv.hpp for nvptx so I removed nvptx specific usages from spirv.hpp

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/spirv.hpp    | 41 ---------------------------
 sycl/include/sycl/group_algorithm.hpp | 25 ++++++++++++++++
 2 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index e6804ab589426..278c019269a7d 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -16,7 +16,6 @@
 #include <sycl/detail/type_list.hpp>
 #include <sycl/detail/type_traits.hpp>
 #include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp>
-#include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
 #include <sycl/id.hpp>
 #include <sycl/memory_enums.hpp>
 
@@ -130,9 +129,6 @@ template <typename Group> bool GroupAll(Group, bool pred) {
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
-#if defined(__NVPTX__)
-  return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
-#else
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
@@ -141,21 +137,16 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
   } else {
     return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
   }
-#endif
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAll(
     ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
     bool pred) {
-#if defined(__NVPTX__)
-  return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
-#else
   // GroupNonUniformAll doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseAnd(
       group_scope<ParentGroup>::value,
       static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
       static_cast<uint32_t>(pred), PartitionSize);
-#endif
 }
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
@@ -164,12 +155,8 @@ bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
 
 bool GroupAll(const ext::oneapi::experimental::opportunistic_group &g,
               bool pred) {
-#if defined(__NVPTX__)
-  return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
-#else
   return __spirv_GroupNonUniformAll(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
-#endif
 }
 
 template <typename Group> bool GroupAny(Group, bool pred) {
@@ -178,9 +165,6 @@ template <typename Group> bool GroupAny(Group, bool pred) {
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
-#if defined(__NVPTX__)
-  return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
-#else
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
@@ -189,21 +173,16 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
   } else {
     return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
   }
-#endif
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAny(
     ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
     bool pred) {
-#if defined(__NVPTX__)
-  return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
-#else
   // GroupNonUniformAny doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseOr(
       group_scope<ParentGroup>::value,
       static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
       static_cast<uint32_t>(pred), PartitionSize);
-#endif
 }
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
@@ -211,12 +190,8 @@ bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
 }
 bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g,
               bool pred) {
-#if defined(__NVPTX__)
-  return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], pred);
-#else
   return __spirv_GroupNonUniformAny(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
-#endif
 }
 
 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
@@ -294,10 +269,6 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
                T x, IdT local_id) {
   // Remap local_id to its original numbering in ParentGroup.
   auto LocalId = detail::IdToMaskPosition(g, local_id);
-#if defined(__NVPTX__)
-  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
-                                  LocalId, 31);
-#else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -317,7 +288,6 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
                                             OCLX, OCLId);
   }
-#endif
 }
 template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
@@ -325,10 +295,6 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
     T x, IdT local_id) {
   // Remap local_id to its original numbering in ParentGroup
   auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
-#if defined(__NVPTX__)
-  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
-                                  LocalId, 31);
-#else
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -344,7 +310,6 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
   // partition, and it's unclear which will be faster in practice.
   return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
                                         OCLId);
-#endif
 }
 template <typename ParentGroup, typename T, typename IdT>
 EnableIfNativeBroadcast<T, IdT>
@@ -371,11 +336,6 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
                IdT local_id) {
   // Remap local_id to its original numbering in sub-group
   auto LocalId = detail::IdToMaskPosition(g, local_id);
-#if defined(__NVPTX__)
-  return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], x,
-                                  LocalId, 31);
-#else
-  // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<sycl::ext::oneapi::sub_group>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
   using OCLT = detail::ConvertToOpenCLType_t<T>;
@@ -387,7 +347,6 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
   return __spirv_GroupNonUniformBroadcast(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, OCLX,
       OCLId);
-#endif
 }
 
 template <typename Group, typename T, typename IdT>
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 00c9d16263d1b..1233b8bb4833a 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -387,6 +387,12 @@ template <typename Group>
 std::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
 any_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0],
+                                pred);
+  }
+#endif
   return sycl::detail::spirv::GroupAny(g, pred);
 #else
   (void)g;
@@ -427,6 +433,12 @@ template <typename Group>
 std::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
 all_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0],
+                                pred);
+  }
+#endif
   return sycl::detail::spirv::GroupAll(g, pred);
 #else
   (void)g;
@@ -467,6 +479,12 @@ template <typename Group>
 std::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
 none_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0],
+                                !pred);
+  }
+#endif
   return sycl::detail::spirv::GroupAll(g, !pred);
 #else
   (void)g;
@@ -585,6 +603,13 @@ std::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                  T>
 group_broadcast(Group g, T x, typename Group::id_type local_id) {
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    auto LocalId = detail::IdToMaskPosition(g, local_id);
+    return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0],
+                                    x, LocalId, 31);
+  }
+#endif
   return sycl::detail::spirv::GroupBroadcast(g, x, local_id);
 #else
   (void)g;

From 9844fcba5b456fd024c42a6218029c1e0bbc6eba Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Fri, 2 Jun 2023 09:56:38 +0100
Subject: [PATCH 20/22] Revert unrequired changes.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/spirv.hpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 278c019269a7d..48631bc800033 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -140,7 +140,7 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAll(
-    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
     bool pred) {
   // GroupNonUniformAll doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseAnd(
@@ -153,7 +153,7 @@ bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
   return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
 }
 
-bool GroupAll(const ext::oneapi::experimental::opportunistic_group &g,
+bool GroupAll(const ext::oneapi::experimental::opportunistic_group &,
               bool pred) {
   return __spirv_GroupNonUniformAll(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
@@ -176,7 +176,7 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAny(
-    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
     bool pred) {
   // GroupNonUniformAny doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseOr(
@@ -188,7 +188,7 @@ template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
   return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
 }
-bool GroupAny(const ext::oneapi::experimental::opportunistic_group &g,
+bool GroupAny(const ext::oneapi::experimental::opportunistic_group &,
               bool pred) {
   return __spirv_GroupNonUniformAny(
       group_scope<ext::oneapi::experimental::opportunistic_group>::value, pred);
@@ -269,6 +269,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
                T x, IdT local_id) {
   // Remap local_id to its original numbering in ParentGroup.
   auto LocalId = detail::IdToMaskPosition(g, local_id);
+
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -295,6 +296,7 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
     T x, IdT local_id) {
   // Remap local_id to its original numbering in ParentGroup
   auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
+
   // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<ParentGroup>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
@@ -336,6 +338,8 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x,
                IdT local_id) {
   // Remap local_id to its original numbering in sub-group
   auto LocalId = detail::IdToMaskPosition(g, local_id);
+
+  // TODO: Refactor to avoid duplication after design settles.
   using GroupIdT = typename GroupId<sycl::ext::oneapi::sub_group>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
   using OCLT = detail::ConvertToOpenCLType_t<T>;

From eea1d9ab0d66354034a3872936032ada47ee57c8 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Fri, 9 Jun 2023 21:25:17 +0100
Subject: [PATCH 21/22] Added missing types.

Refactored impl.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../cuda/non_uniform_algorithms.hpp           | 172 ++++++++----------
 1 file changed, 72 insertions(+), 100 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index c03eef3d1605a..c149dcae16618 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -94,58 +94,42 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 
 //// Shuffle based masked reduction impls
 
-// fixed_size_group group reduction using shfls, T = double
+// fixed_size_group group reduction using shfls
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
-                         std::is_same_v<T, double>,
-                     T>
-    masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                                const uint32_t MemberMask) {
-  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
-    int x_a, x_b;
-    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
-
-    auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
-    auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
-    double tmp;
-    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
-                 : "=l"(tmp)
-                 : "r"(tmp_a), "r"(tmp_b));
-    x = binary_op(x, tmp);
-  }
-
-  return x;
-}
-
-// fixed_size_group group reduction using shfls, T = float
-template <typename Group, typename T, class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
-                         std::is_same_v<T, float>,
-                     T>
-    masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                                const uint32_t MemberMask) {
-
-  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
-    auto tmp =
-        __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i);
-    x = binary_op(x, __nvvm_bitcast_i2f(tmp));
-  }
-  return x;
-}
-
-// fixed_size_group group reduction using shfls, std::is_integral_v<T>
-template <typename Group, typename T, class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group> &&
-                         std::is_integral_v<T>,
-                     T>
+    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
     masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                                 const uint32_t MemberMask) {
-
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
-    auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
+    T tmp;
+    if constexpr (std::is_same_v<T, double>) {
+      int x_a, x_b;
+      asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x));
+      auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
+      auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
+      asm volatile("mov.b64 %0,{%1,%2};" : "=d"(tmp) : "r"(tmp_a), "r"(tmp_b));
+    } else if constexpr (std::is_same_v<T, long> ||
+                         std::is_same_v<T, unsigned long>) {
+      int x_a, x_b;
+      asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x));
+      auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
+      auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
+      asm volatile("mov.b64 %0,{%1,%2};" : "=l"(tmp) : "r"(tmp_a), "r"(tmp_b));
+    } else if constexpr (std::is_same_v<T, half>) {
+      short tmp_b16;
+      asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x));
+      auto tmp_b32 = __nvvm_shfl_sync_bfly_i32(
+          MemberMask, static_cast<int>(tmp_b16), -1, i);
+      asm volatile("mov.b16 %0,%1;"
+                   : "=h"(tmp)
+                   : "h"(static_cast<short>(tmp_b32)));
+    } else if constexpr (std::is_same_v<T, float>) {
+      auto tmp_b32 =
+          __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i);
+      tmp = __nvvm_bitcast_i2f(tmp_b32);
+    } else {
+      tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
+    }
     x = binary_op(x, tmp);
   }
   return x;
@@ -169,17 +153,31 @@ non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) {
   T res;
   if constexpr (std::is_same_v<T, double>) {
     int x_a, x_b;
-    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
-
+    asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x));
     auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, shfl_param);
     auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, shfl_param);
-    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
-                 : "=l"(res)
-                 : "r"(tmp_a), "r"(tmp_b));
+    asm volatile("mov.b64 %0,{%1,%2};" : "=d"(res) : "r"(tmp_a), "r"(tmp_b));
+  } else if constexpr (std::is_same_v<T, long> ||
+                       std::is_same_v<T, unsigned long>) {
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x));
+    auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, shfl_param);
+    auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, shfl_param);
+    asm volatile("mov.b64 %0,{%1,%2};" : "=l"(res) : "r"(tmp_a), "r"(tmp_b));
+  } else if constexpr (std::is_same_v<T, half>) {
+    short tmp_b16;
+    asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x));
+    auto tmp_b32 = non_uniform_shfl_T<Group>(
+        MemberMask, static_cast<int>(tmp_b16), shfl_param);
+    asm volatile("mov.b16 %0,%1;"
+                 : "=h"(res)
+                 : "h"(static_cast<short>(tmp_b32)));
+  } else if constexpr (std::is_same_v<T, float>) {
+    auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, __nvvm_bitcast_f2i(x),
+                                             shfl_param);
+    res = __nvvm_bitcast_i2f(tmp_b32);
   } else {
-    auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
-    auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, input, shfl_param);
-    res = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
+    res = non_uniform_shfl_T<Group>(MemberMask, x, shfl_param);
   }
   return res;
 }
@@ -192,27 +190,18 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                             const uint32_t MemberMask) {
-  if (MemberMask == 0xffffffff) {
-    for (int i = 16; i > 0; i /= 2) {
-      auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
-      x = binary_op(x, tmp);
-    }
-    return x;
-  }
 
   unsigned localSetBit = g.get_local_id()[0] + 1;
 
   // number of elements requiring binary operations each loop iteration
   auto opRange = g.get_local_range()[0];
 
-  // remainder that won't have a binary partner each loop iteration
-  int remainder;
-
-  while (opRange / 2 >= 1) {
-    remainder = opRange % 2;
+  // stride between local_ids forming a binary op
+  unsigned stride = opRange / 2;
+  while (stride >= 1) {
 
-    // stride between local_ids forming a binary op
-    int stride = opRange / 2;
+    // if (remainder == 1), there is a WI without a binary op partner
+    unsigned remainder = opRange % 2;
 
     // unfolded position of set bit in mask of shfl src lane
     int unfoldedSrcSetBit = localSetBit + stride;
@@ -226,12 +215,15 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
       x = binary_op(x, tmp);
     }
 
-    opRange = std::ceil((float)opRange / 2.0f);
+    opRange = stride + remainder;
+    stride = opRange / 2;
   }
-  int broadID;
-  int maskRev;
-  asm volatile("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask));
-  asm volatile("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev));
+  unsigned broadID;
+  asm volatile(".reg .u32 rev;\n\t"
+               "brev.b32 rev, %1;\n\t" // reverse mask bits
+               "clz.b32 %0, rev;"
+               : "=r"(broadID)
+               : "r"(MemberMask));
 
   return non_uniform_shfl(g, MemberMask, x, broadID);
 }
@@ -279,28 +271,7 @@ inline __SYCL_ALWAYS_INLINE
   inline __SYCL_ALWAYS_INLINE                                                  \
       std::enable_if_t<OP_CHECK<T, BinaryOperation>::value, T>                 \
       get_identity() {                                                         \
-    if constexpr (std::is_same_v<T, char>) {                                   \
-      return std::numeric_limits<char>::OP();                                  \
-    } else if constexpr (std::is_same_v<T, unsigned char>) {                   \
-      return std::numeric_limits<unsigned char>::OP();                         \
-    } else if constexpr (std::is_same_v<T, short>) {                           \
-      return std::numeric_limits<short>::OP();                                 \
-    } else if constexpr (std::is_same_v<T, unsigned short>) {                  \
-      return std::numeric_limits<unsigned short>::OP();                        \
-    } else if constexpr (std::is_same_v<T, int>) {                             \
-      return std::numeric_limits<int>::OP();                                   \
-    } else if constexpr (std::is_same_v<T, unsigned int>) {                    \
-      return std::numeric_limits<unsigned int>::OP();                          \
-    } else if constexpr (std::is_same_v<T, long>) {                            \
-      return std::numeric_limits<int>::OP();                                   \
-    } else if constexpr (std::is_same_v<T, unsigned long>) {                   \
-      return std::numeric_limits<unsigned int>::OP();                          \
-    } else if constexpr (std::is_same_v<T, float>) {                           \
-      return std::numeric_limits<float>::OP();                                 \
-    } else if constexpr (std::is_same_v<T, double>) {                          \
-      return std::numeric_limits<double>::OP();                                \
-    }                                                                          \
-    return 0;                                                                  \
+    return std::numeric_limits<T>::OP();                                       \
   }
 
 GET_ID(IsMinimum, max)
@@ -317,15 +288,16 @@ inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
     masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                            const uint32_t MemberMask) {
+  unsigned localIdVal = g.get_local_id()[0];
   for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
     auto tmp = non_uniform_shfl(g, MemberMask, x, i);
-    if (g.get_local_id()[0] >= i)
+    if (localIdVal >= i)
       x = binary_op(x, tmp);
   }
   if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
 
     x = non_uniform_shfl(g, MemberMask, x, 1);
-    if (g.get_local_id()[0] == 0) {
+    if (localIdVal == 0) {
       return get_identity<T, BinaryOperation>();
     }
   }
@@ -340,8 +312,8 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     T>
 masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                        const uint32_t MemberMask) {
-  int localIdVal = g.get_local_id()[0];
-  int localSetBit = localIdVal + 1;
+  unsigned localIdVal = g.get_local_id()[0];
+  unsigned localSetBit = localIdVal + 1;
 
   for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
     int unfoldedSrcSetBit = localSetBit - i;

From 686d11727028759b4b857b3968b6afbc408325b9 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Fri, 9 Jun 2023 21:41:49 +0100
Subject: [PATCH 22/22] is_fixed_size_group moved to detail namespace.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/type_traits.hpp      | 12 ++++++-----
 .../cuda/non_uniform_algorithms.hpp           | 20 +++++++++----------
 .../oneapi/experimental/fixed_size_group.hpp  |  9 ++++++---
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index df17a8e945d5d..be072531a7a14 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -20,6 +20,13 @@
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+template <class T> struct is_fixed_size_group : std::false_type {};
+
+template <class T>
+inline constexpr bool is_fixed_size_group_v = is_fixed_size_group<T>::value;
+} // namespace detail
+
 template <int Dimensions> class group;
 namespace ext::oneapi {
 struct sub_group;
@@ -50,11 +57,6 @@ template <class T>
 inline constexpr bool is_user_constructed_group_v =
     is_user_constructed_group<T>::value;
 
-template <class T> struct is_fixed_size_group : std::false_type {};
-
-template <class T>
-inline constexpr bool is_fixed_size_group_v = is_fixed_size_group<T>::value;
-
 namespace detail {
 template <typename T> struct is_group_helper : std::false_type {};
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
index c149dcae16618..eea68d89a35fe 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp
@@ -96,10 +96,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 
 // fixed_size_group group reduction using shfls
 template <typename Group, typename T, class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
-    masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                                const uint32_t MemberMask) {
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<is_fixed_size_group_v<Group>, T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
     T tmp;
     if constexpr (std::is_same_v<T, double>) {
@@ -139,7 +138,7 @@ template <typename Group, typename T>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
 non_uniform_shfl_T(const uint32_t MemberMask, T x, int shfl_param) {
-  if constexpr (ext::oneapi::experimental::is_fixed_size_group_v<Group>) {
+  if constexpr (is_fixed_size_group_v<Group>) {
     return __nvvm_shfl_sync_up_i32(MemberMask, x, shfl_param, 0);
   } else {
     return __nvvm_shfl_sync_idx_i32(MemberMask, x, shfl_param, 31);
@@ -186,7 +185,7 @@ non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) {
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
-        !ext::oneapi::experimental::is_fixed_size_group_v<Group>,
+        !is_fixed_size_group_v<Group>,
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                             const uint32_t MemberMask) {
@@ -284,10 +283,9 @@ GET_ID(IsMaximum, min)
 // fixed_size_group group scan using shfls
 template <__spv::GroupOperation Op, typename Group, typename T,
           class BinaryOperation>
-inline __SYCL_ALWAYS_INLINE
-    std::enable_if_t<ext::oneapi::experimental::is_fixed_size_group_v<Group>, T>
-    masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                           const uint32_t MemberMask) {
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<is_fixed_size_group_v<Group>, T>
+masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                       const uint32_t MemberMask) {
   unsigned localIdVal = g.get_local_id()[0];
   for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
     auto tmp = non_uniform_shfl(g, MemberMask, x, i);
@@ -308,7 +306,7 @@ template <__spv::GroupOperation Op, typename Group, typename T,
           class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
-        !ext::oneapi::experimental::is_fixed_size_group_v<Group>,
+        !is_fixed_size_group_v<Group>,
     T>
 masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                        const uint32_t MemberMask) {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index d51a18c3f2725..cbc6be038f4ba 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -161,11 +161,14 @@ template <size_t PartitionSize, typename ParentGroup>
 struct is_user_constructed_group<fixed_size_group<PartitionSize, ParentGroup>>
     : std::true_type {};
 
+} // namespace ext::oneapi::experimental
+
+namespace detail {
 template <size_t PartitionSize, typename ParentGroup>
-struct is_fixed_size_group<fixed_size_group<PartitionSize, ParentGroup>>
+struct is_fixed_size_group<
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>>
     : std::true_type {};
-
-} // namespace ext::oneapi::experimental
+} // namespace detail
 
 template <size_t PartitionSize, typename ParentGroup>
 struct is_group<