From 1b09219941dd9232d2c433c68b5b0d161efaf3ac Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 15 Jan 2025 16:23:02 +0000 Subject: [PATCH 1/4] Optimize/(fix?) permute_sub_group_by_xor Signed-off-by: JackAKirk --- sycl/include/syclcompat/util.hpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp index 2fb085509cf6a..f476b60bda401 100644 --- a/sycl/include/syclcompat/util.hpp +++ b/sycl/include/syclcompat/util.hpp @@ -306,14 +306,18 @@ T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, template T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size = 32) { - unsigned int id = g.get_local_linear_id(); - unsigned int start_index = - id / logical_sub_group_size * logical_sub_group_size; - unsigned int target_offset = (id % logical_sub_group_size) ^ mask; - return sycl::select_from_group(g, x, - target_offset < logical_sub_group_size - ? start_index + target_offset - : id); + if (logical_sub_group_size == 32) { + return permute_group_by_xor(g, x, mask); + } else { + unsigned int id = g.get_local_linear_id(); + unsigned int start_index = + id / logical_sub_group_size * logical_sub_group_size; + unsigned int target_offset = (id % logical_sub_group_size) ^ mask; + return sycl::select_from_group(g, x, + target_offset < logical_sub_group_size + ? start_index + target_offset + : id); + } } namespace experimental { From bf13d41ec0428ec1d287ccdc6feafbbe88fa84c1 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 16 Jan 2025 11:28:13 +0000 Subject: [PATCH 2/4] Split test into two test cases for easier debugging. Signed-off-by: JackAKirk --- .../util/util_permute_sub_group_by_xor.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp index 7b877d826f18b..7b6d3c9c2b029 100644 --- a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp +++ b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp @@ -86,11 +86,9 @@ void test_permute_sub_group_by_xor() { syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device(); sycl::queue *q_ct1 = dev_ct1.default_queue(); bool Result = true; - int *dev_data = nullptr; unsigned int *dev_data_u = nullptr; sycl::range<3> GridSize(1, 1, 1); sycl::range<3> BlockSize(1, 1, 1); - dev_data = sycl::malloc_device(DATA_NUM, *q_ct1); dev_data_u = sycl::malloc_device(DATA_NUM, *q_ct1); GridSize = sycl::range<3>(1, 1, 2); @@ -120,6 +118,19 @@ void test_permute_sub_group_by_xor() { q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int)) .wait(); verify_data(host_dev_data_u, expect1, DATA_NUM); + sycl::free(dev_data_u, *q_ct1); +} + +void test_permute_sub_group_by_xor_extra_arg() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device(); + sycl::queue *q_ct1 = dev_ct1.default_queue(); + bool Result = true; + unsigned int *dev_data_u = nullptr; + sycl::range<3> GridSize(1, 1, 1); + sycl::range<3> BlockSize(1, 1, 1); + dev_data_u = sycl::malloc_device(DATA_NUM, *q_ct1); GridSize = sycl::range<3>(1, 1, 2); BlockSize = sycl::range<3>(1, 2, 32); @@ -147,13 +158,12 @@ void test_permute_sub_group_by_xor() { q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int)) .wait(); verify_data(host_dev_data_u, expect2, DATA_NUM); - - sycl::free(dev_data, *q_ct1); sycl::free(dev_data_u, *q_ct1); } int main() { test_permute_sub_group_by_xor(); + test_permute_sub_group_by_xor_extra_arg(); return 0; } From affc058665e739602f33c988c50d3567f78be2e1 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 16 Jan 2025 12:09:58 +0000 Subject: [PATCH 3/4] Add missing host_dev_data_u Signed-off-by: JackAKirk --- sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp index 7b6d3c9c2b029..6b0b478b1e367 100644 --- a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp +++ b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp @@ -144,6 +144,7 @@ void test_permute_sub_group_by_xor_extra_arg() { 91, 90, 93, 92, 95, 94, 97, 96, 99, 98, 101, 100, 103, 102, 105, 104, 107, 106, 109, 108, 111, 110, 113, 112, 115, 114, 117, 116, 119, 118, 121, 120, 123, 122, 125, 124, 127, 126}; + unsigned int host_dev_data_u[DATA_NUM]; init_data(host_dev_data_u, DATA_NUM); q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int)) From bfe680ed16f04a0bfd2269a8412cbfdb1c606501 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 16 Jan 2025 13:53:09 +0000 Subject: [PATCH 4/4] Remove unnecessary else statement Signed-off-by: JackAKirk --- sycl/include/syclcompat/util.hpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp index f476b60bda401..a1aff77bfe745 100644 --- a/sycl/include/syclcompat/util.hpp +++ b/sycl/include/syclcompat/util.hpp @@ -308,16 +308,15 @@ T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, int logical_sub_group_size = 32) { if (logical_sub_group_size == 32) { return permute_group_by_xor(g, x, mask); - } else { - unsigned int id = g.get_local_linear_id(); - unsigned int start_index = - id / logical_sub_group_size * logical_sub_group_size; - unsigned int target_offset = (id % logical_sub_group_size) ^ mask; - return sycl::select_from_group(g, x, - target_offset < logical_sub_group_size - ? start_index + target_offset - : id); } + unsigned int id = g.get_local_linear_id(); + unsigned int start_index = + id / logical_sub_group_size * logical_sub_group_size; + unsigned int target_offset = (id % logical_sub_group_size) ^ mask; + return sycl::select_from_group(g, x, + target_offset < logical_sub_group_size + ? start_index + target_offset + : id); } namespace experimental {