Merged

28 commits
bb39b47
Basic cuda support for opportunistic_group, fixed_size_group, and
JackAKirk May 2, 2023
4ca058c
Fix test failure, add comment in libclc.
JackAKirk May 5, 2023
df0b97d
Merge branch 'sycl' into cuda-non-uniform-groups
JackAKirk May 9, 2023
369f25f
format
JackAKirk May 9, 2023
f443f81
format
JackAKirk May 9, 2023
1cb03df
Merge branch 'sycl' into cuda-non-uniform-algs
JackAKirk May 18, 2023
3069a1e
Optimized `IdToMaskPosition` NVPTX case.
JackAKirk May 19, 2023
0b1b82a
barrier, broadcast, any_of, all_of, none_of impls
JackAKirk May 19, 2023
5a9668b
reduce/scan impls.
JackAKirk May 19, 2023
4188a17
is_fixed_size_group check for reduce/scan branch impls
JackAKirk May 19, 2023
cf3d4e7
cuda reduce/scans use non_uniform_algorithms.hpp
JackAKirk May 19, 2023
ebb034b
Merge branch 'cuda-non-uniform-groups' into cuda-non-uniform-algs
JackAKirk May 19, 2023
cf55f58
Enabled cuda in algorithm tests.
JackAKirk May 19, 2023
c045fc5
Added missing volatile.
JackAKirk May 19, 2023
37df9fa
Merge branch 'sycl' into cuda-non-uniform-algs
May 31, 2023
36e59bb
Format and fixed sycl branch merge.
May 31, 2023
f3c8665
Format.
May 31, 2023
57c0bd9
Format.
May 31, 2023
4878586
Make Is_Redux nvptx only.
May 31, 2023
7aa585f
Added known_identity.hpp include.
May 31, 2023
a40e410
Addressed review comments.
JackAKirk Jun 1, 2023
f20c936
Removed unrequired includes.
Jun 1, 2023
dcffcbe
Removed breaking opportunistic_group include.
Jun 1, 2023
9844fcb
Revert unrequired changes.
Jun 2, 2023
557a1a3
Merge branch 'sycl' into cuda-non-uniform-algs
Jun 5, 2023
eea1d9a
Added missing types.
Jun 9, 2023
686d117
is_fixed_size_group moved to detail namespace.
Jun 9, 2023
82691c5
Merge branch 'sycl' into cuda-non-uniform-algs
Jun 12, 2023
10 changes: 5 additions & 5 deletions sycl/include/sycl/detail/spirv.hpp
@@ -152,7 +152,7 @@ template <typename ParentGroup>
bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
}
template <typename Group>

bool GroupAll(const ext::oneapi::experimental::opportunistic_group &,
bool pred) {
return __spirv_GroupNonUniformAll(
@@ -1022,8 +1022,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
template <typename Group>
typename std::enable_if_t<
ext::oneapi::experimental::is_user_constructed_group_v<Group>>
ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
#if defined(__SPIR__)
ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
#if defined(__NVPTX__)
__nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]);
#else
Contributor

Is it an intentional change from
#if defined(__SPIR__)
#elif (__NVPTX__)
no else here
#endif
to
#if defined(__NVPTX__)
#else + __SPIR__
#endif
?

Contributor Author

I changed it like this purely to be consistent with other cases, which don't do any checks and just call the __spirv functions directly. I'm not sure what is best here: @Pennycook, what do you prefer?

Contributor

I have no preference. But what you've done here is consistent with other parts of DPC++, at least. For example, the sub-group implementation assumes that SPIR-V intrinsics will be supported. I think this makes sense, because some SPIR-V intrinsics are implemented in libclc.

Contributor

ok, agree

// SPIR-V does not define an instruction to synchronize partial groups.
// However, most (possibly all?) of the current SPIR-V targets execute
// work-items in lockstep, so we can probably get away with a MemoryBarrier.
@@ -1033,8 +1035,6 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
__spv::MemorySemanticsMask::SubgroupMemory |
__spv::MemorySemanticsMask::WorkgroupMemory |
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
#elif defined(__NVPTX__)
// TODO: Call syncwarp with appropriate mask extracted from the group
#endif
}
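
For reference, a minimal usage sketch of the barrier path changed above, assuming the sycl_ext_oneapi_non_uniform_groups API (get_ballot_group), a sycl::queue q targeting a CUDA device, and an illustrative kernel body:

namespace sycl_exp = sycl::ext::oneapi::experimental;
q.parallel_for(sycl::nd_range<1>{128, 32}, [=](sycl::nd_item<1> it) {
  auto sg = it.get_sub_group();
  // Split the sub-group into two ballot groups based on a data-dependent predicate.
  auto bg = sycl_exp::get_ballot_group(sg, it.get_global_linear_id() % 2 == 0);
  // With this change, the barrier lowers to bar.warp.sync over the group's member mask on NVPTX.
  sycl::group_barrier(bg);
});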

7 changes: 7 additions & 0 deletions sycl/include/sycl/detail/type_traits.hpp
@@ -20,6 +20,13 @@

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {
template <class T> struct is_fixed_size_group : std::false_type {};

template <class T>
inline constexpr bool is_fixed_size_group_v = is_fixed_size_group<T>::value;
} // namespace detail

template <int Dimensions> class group;
namespace ext::oneapi {
struct sub_group;
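
As an aside, a compile-time sketch of how the new detail trait is intended to be queried; the fixed_size_group parent-group argument below is an assumption for illustration:

namespace sycl_exp = sycl::ext::oneapi::experimental;
// fixed_size_group partitions map to true; other user-constructed groups map to false.
static_assert(sycl::detail::is_fixed_size_group_v<
              sycl_exp::fixed_size_group<8, sycl::sub_group>>);
static_assert(!sycl::detail::is_fixed_size_group_v<sycl_exp::opportunistic_group>);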
@@ -0,0 +1,337 @@
//==----- non_uniform_algorithms.hpp - cuda masked subgroup algorithms -----==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once
#include <sycl/known_identity.hpp>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)

// Redux is only valid when T is an integral type and the operation is one of
// those listed below.
template <typename T, class BinaryOperation>
using IsRedux = std::bool_constant<
std::is_integral<T>::value &&
(IsBitAND<T, BinaryOperation>::value || IsBitOR<T, BinaryOperation>::value ||
IsBitXOR<T, BinaryOperation>::value || IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value || IsMaximum<T, BinaryOperation>::value)>;

//// Masked reductions using redux.sync, requires integer types
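// redux.sync is a single PTX instruction, available from sm_80 onwards, that
// reduces a 32-bit integer value across the threads named in the member mask.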

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<
is_sugeninteger<T>::value && IsMinimum<T, BinaryOperation>::value, T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_umin(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<
is_sigeninteger<T>::value && IsMinimum<T, BinaryOperation>::value, T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_min(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<
is_sugeninteger<T>::value && IsMaximum<T, BinaryOperation>::value, T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_umax(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<
is_sigeninteger<T>::value && IsMaximum<T, BinaryOperation>::value, T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_max(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
IsPlus<T, BinaryOperation>::value,
T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_add(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
IsBitAND<T, BinaryOperation>::value,
T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_and(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
IsBitOR<T, BinaryOperation>::value,
T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_or(x, MemberMask);
}

template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<(is_sugeninteger<T>::value || is_sigeninteger<T>::value) &&
IsBitXOR<T, BinaryOperation>::value,
T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return __nvvm_redux_sync_xor(x, MemberMask);
}
////

//// Shuffle based masked reduction impls

// fixed_size_group group reduction using shfls
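// A fixed_size_group partition is a contiguous, power-of-two sized slice of the
// sub-group, so a butterfly (xor) shuffle reduction over log2(size) steps can be
// used directly within the member mask.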
template <typename Group, typename T, class BinaryOperation>
inline __SYCL_ALWAYS_INLINE std::enable_if_t<is_fixed_size_group_v<Group>, T>
masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
T tmp;
if constexpr (std::is_same_v<T, double>) {
int x_a, x_b;
asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x));
auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
asm volatile("mov.b64 %0,{%1,%2};" : "=d"(tmp) : "r"(tmp_a), "r"(tmp_b));
} else if constexpr (std::is_same_v<T, long> ||
std::is_same_v<T, unsigned long>) {
int x_a, x_b;
asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x));
auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
asm volatile("mov.b64 %0,{%1,%2};" : "=l"(tmp) : "r"(tmp_a), "r"(tmp_b));
} else if constexpr (std::is_same_v<T, half>) {
short tmp_b16;
asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x));
auto tmp_b32 = __nvvm_shfl_sync_bfly_i32(
MemberMask, static_cast<int>(tmp_b16), -1, i);
asm volatile("mov.b16 %0,%1;"
: "=h"(tmp)
: "h"(static_cast<short>(tmp_b32)));
} else if constexpr (std::is_same_v<T, float>) {
auto tmp_b32 =
__nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i);
tmp = __nvvm_bitcast_i2f(tmp_b32);
} else {
tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
}
x = binary_op(x, tmp);
}
return x;
}

template <typename Group, typename T>
inline __SYCL_ALWAYS_INLINE std::enable_if_t<
ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
non_uniform_shfl_T(const uint32_t MemberMask, T x, int shfl_param) {
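// Selects the shuffle flavour for a user-constructed group: fixed_size_group
// members occupy contiguous lanes, so a relative shfl.up suffices; for other
// (potentially sparse) groups the caller supplies an absolute source lane and
// shfl.idx is used with a clamp value of 31 (the full warp).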
if constexpr (is_fixed_size_group_v<Group>) {
return __nvvm_shfl_sync_up_i32(MemberMask, x, shfl_param, 0);
} else {
return __nvvm_shfl_sync_idx_i32(MemberMask, x, shfl_param, 31);
}
}

template <typename Group, typename T>
inline __SYCL_ALWAYS_INLINE std::enable_if_t<
ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) {
T res;
if constexpr (std::is_same_v<T, double>) {
int x_a, x_b;
asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x));
auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, shfl_param);
auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, shfl_param);
asm volatile("mov.b64 %0,{%1,%2};" : "=d"(res) : "r"(tmp_a), "r"(tmp_b));
} else if constexpr (std::is_same_v<T, long> ||
std::is_same_v<T, unsigned long>) {
int x_a, x_b;
asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x));
auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, shfl_param);
auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, shfl_param);
asm volatile("mov.b64 %0,{%1,%2};" : "=l"(res) : "r"(tmp_a), "r"(tmp_b));
} else if constexpr (std::is_same_v<T, half>) {
short tmp_b16;
asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x));
auto tmp_b32 = non_uniform_shfl_T<Group>(
MemberMask, static_cast<int>(tmp_b16), shfl_param);
asm volatile("mov.b16 %0,%1;"
: "=h"(res)
: "h"(static_cast<short>(tmp_b32)));
} else if constexpr (std::is_same_v<T, float>) {
auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, __nvvm_bitcast_f2i(x),
shfl_param);
res = __nvvm_bitcast_i2f(tmp_b32);
} else {
res = non_uniform_shfl_T<Group>(MemberMask, x, shfl_param);
}
return res;
}

// Opportunistic/Ballot group reduction using shfls
template <typename Group, typename T, class BinaryOperation>
inline __SYCL_ALWAYS_INLINE std::enable_if_t<
ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
!is_fixed_size_group_v<Group>,
T>
masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
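// The member mask of an opportunistic/ballot group is generally sparse, so
// work-items are addressed by the position of their set bit in the mask rather
// than by physical lane id. Each iteration folds the upper half of those
// positions onto the lower half, using __nvvm_fns (find-nth-set) to translate a
// logical position back into a physical source lane for the shuffle.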

unsigned localSetBit = g.get_local_id()[0] + 1;

// number of elements requiring binary operations each loop iteration
auto opRange = g.get_local_range()[0];

// stride between local_ids forming a binary op
unsigned stride = opRange / 2;
while (stride >= 1) {

// if (remainder == 1), there is a WI without a binary op partner
unsigned remainder = opRange % 2;

// unfolded position of set bit in mask of shfl src lane
int unfoldedSrcSetBit = localSetBit + stride;

// __nvvm_fns automatically wraps around to the correct bit position; the
// position of the source set bit relative to localSetBit has no performance
// impact.
auto tmp = non_uniform_shfl(g, MemberMask, x,
__nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));

if (!(localSetBit == 1 && remainder != 0)) {
x = binary_op(x, tmp);
}

opRange = stride + remainder;
stride = opRange / 2;
}
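// The reduced value now resides in the first (lowest-numbered) member of the
// mask; find that lane by reversing the mask and counting leading zeros, then
// broadcast the result to every member of the group.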
unsigned broadID;
asm volatile(".reg .u32 rev;\n\t"
"brev.b32 rev, %1;\n\t" // reverse mask bits
"clz.b32 %0, rev;"
: "=r"(broadID)
: "r"(MemberMask));

return non_uniform_shfl(g, MemberMask, x, broadID);
}

// Non-redux types must fall back to the shfl-based implementations.
template <typename Group, typename T, class BinaryOperation>
std::enable_if_t<
std::is_same<IsRedux<T, BinaryOperation>, std::false_type>::value &&
ext::oneapi::experimental::is_user_constructed_group_v<Group>,
T>
masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask);
}

// get_identity is currently only used in this CUDA-specific header. If it gains
// more general use in the future, it should be moved to a more appropriate
// header.
template <typename T, class BinaryOperation>
inline __SYCL_ALWAYS_INLINE
std::enable_if_t<IsPlus<T, BinaryOperation>::value ||
IsBitOR<T, BinaryOperation>::value ||
IsBitXOR<T, BinaryOperation>::value,
T>
get_identity() {
return 0;
}

template <typename T, class BinaryOperation>
inline __SYCL_ALWAYS_INLINE
std::enable_if_t<IsMultiplies<T, BinaryOperation>::value, T>
get_identity() {
return 1;
}

template <typename T, class BinaryOperation>
inline __SYCL_ALWAYS_INLINE
std::enable_if_t<IsBitAND<T, BinaryOperation>::value, T>
get_identity() {
return ~0;
}

#define GET_ID(OP_CHECK, OP) \
template <typename T, class BinaryOperation> \
inline __SYCL_ALWAYS_INLINE \
std::enable_if_t<OP_CHECK<T, BinaryOperation>::value, T> \
get_identity() { \
return std::numeric_limits<T>::OP(); \
}

GET_ID(IsMinimum, max)
GET_ID(IsMaximum, min)

#undef GET_ID

//// Shuffle based masked scan impls

// fixed_size_group group scan using shfls
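// Hillis-Steele style inclusive scan: at step i, every member with local id >= i
// accumulates the value held i members below it. For an exclusive scan the
// inclusive result is shifted up by one member and the identity is returned in
// the first member.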
template <__spv::GroupOperation Op, typename Group, typename T,
class BinaryOperation>
inline __SYCL_ALWAYS_INLINE std::enable_if_t<is_fixed_size_group_v<Group>, T>
masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
unsigned localIdVal = g.get_local_id()[0];
for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
auto tmp = non_uniform_shfl(g, MemberMask, x, i);
if (localIdVal >= i)
x = binary_op(x, tmp);
}
if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {

x = non_uniform_shfl(g, MemberMask, x, 1);
if (localIdVal == 0) {
return get_identity<T, BinaryOperation>();
}
}
return x;
}

template <__spv::GroupOperation Op, typename Group, typename T,
class BinaryOperation>
inline __SYCL_ALWAYS_INLINE std::enable_if_t<
ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
!is_fixed_size_group_v<Group>,
T>
masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
const uint32_t MemberMask) {
unsigned localIdVal = g.get_local_id()[0];
unsigned localSetBit = localIdVal + 1;
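// Same scan structure as the fixed_size_group case above, but members may occupy
// non-contiguous lanes, so each source member's position in the mask is
// translated to a physical lane with __nvvm_fns before the shuffle.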

for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
int unfoldedSrcSetBit = localSetBit - i;

auto tmp = non_uniform_shfl(g, MemberMask, x,
__nvvm_fns(MemberMask, 0, unfoldedSrcSetBit));
if (localIdVal >= i)
x = binary_op(x, tmp);
}
if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
x = non_uniform_shfl(g, MemberMask, x,
__nvvm_fns(MemberMask, 0, localSetBit - 1));
if (localIdVal == 0) {
return get_identity<T, BinaryOperation>();
}
}
return x;
}

#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
@@ -163,6 +163,13 @@ struct is_user_constructed_group<fixed_size_group<PartitionSize, ParentGroup>>

} // namespace ext::oneapi::experimental

namespace detail {
template <size_t PartitionSize, typename ParentGroup>
struct is_fixed_size_group<
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>>
: std::true_type {};
} // namespace detail

template <size_t PartitionSize, typename ParentGroup>
struct is_group<
ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>>
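
Putting the pieces together, a hedged end-to-end sketch of what this PR enables on the CUDA backend; it assumes get_fixed_size_group from the non-uniform groups extension, a sycl::queue q on a CUDA device, and a USM pointer out:

namespace sycl_exp = sycl::ext::oneapi::experimental;
q.parallel_for(sycl::nd_range<1>{128, 32}, [=](sycl::nd_item<1> it) {
  auto sg = it.get_sub_group();
  // Partition the 32-wide sub-group into contiguous groups of 8 work-items.
  auto fsg = sycl_exp::get_fixed_size_group<8>(sg);
  int v = static_cast<int>(it.get_global_linear_id());
  // Integer plus should take the redux.sync path on sm_80 and newer; other
  // type/op combinations fall back to the shfl-based reductions added above.
  int sum = sycl::reduce_over_group(fsg, v, sycl::plus<>());
  // Scans go through the shfl-based masked_scan_cuda_shfls implementations.
  int scan = sycl::inclusive_scan_over_group(fsg, v, sycl::plus<>());
  out[it.get_global_linear_id()] = sum + scan;
});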