From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 31 Oct 2023 15:42:32 -0400 Subject: [PATCH 01/14] Sw prefetch in memset (#2) * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting --- libc/src/string/CMakeLists.txt | 1 + libc/src/string/memory_utils/op_generic.h | 26 +++++++ .../memory_utils/x86_64/inline_memset.h | 75 ++++++++++++------- .../llvm-project-overlay/libc/BUILD.bazel | 1 + 4 files changed, 78 insertions(+), 25 deletions(-) diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 67675b682081c..aa69bff7a8cfa 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2) add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2) add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) + add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memset(memset) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index fd71ca30e24b9..4063de1d5f583 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace LIBC_NAMESPACE +namespace LIBC_NAMESPACE::sw_prefetch { +// Size of a cacheline for software prefetching +static constexpr size_t kCachelineSize = 64; +// prefetch for write +static inline void 
PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } +} // namespace LIBC_NAMESPACE::sw_prefetch + namespace LIBC_NAMESPACE::generic { // We accept three types of values as elements for generic operations: @@ -163,6 +170,25 @@ template struct Memset { } while (offset < count - SIZE); tail(dst, value, count); } + + template + LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, + size_t count) { + size_t offset = 0; + + while (offset + prefetch_degree + SIZE <= count) { + for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) + sw_prefetch::PrefetchW(dst + offset + prefetch_distance + + sw_prefetch::kCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) + block(dst + offset, value); + } + while (offset + SIZE < count) { + block(dst + offset, value); + offset += SIZE; + } + tail(dst, value, count); + } }; template struct MemsetSequence { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 6436594856b0e..4834968c0b99f 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -16,9 +16,12 @@ #include // size_t namespace LIBC_NAMESPACE { +namespace x86 { +LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = + LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); + +} // namespace x86 -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) using uint128_t = generic_v128; using uint256_t = generic_v256; @@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { using uint512_t = cpp::array; #endif - if (count == 0) - return; - if (count == 1) - return generic::Memset::block(dst, value); - if (count == 2) - return generic::Memset::block(dst, value); - if (count == 3) - return generic::MemsetSequence::block(dst, value); - if (count <= 8) - return 
generic::Memset::head_tail(dst, value, count); - if (count <= 16) - return generic::Memset::head_tail(dst, value, count); - if (count <= 32) - return generic::Memset::head_tail(dst, value, count); - if (count <= 64) - return generic::Memset::head_tail(dst, value, count); - if (count <= 128) - return generic::Memset::head_tail(dst, value, count); - // Aligned loop - generic::Memset::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset::loop_and_tail(dst, value, count); -} + [[maybe_unused]] LIBC_INLINE static void + inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); + // Aligned loop + generic::Memset::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return generic::Memset::loop_and_tail(dst, value, count); + } else { + return generic::Memset::loop_and_tail_prefetch<320, 128>( + dst, value, count); + } + } + + [[maybe_unused]] LIBC_INLINE static void + inline_memset_x86(Ptr dst, uint8_t value, size_t count) { + if (count == 0) + return; + if (count == 1) + return generic::Memset::block(dst, value); + if (count == 2) + return generic::Memset::block(dst, value); + if (count == 3) + return generic::MemsetSequence::block(dst, value); + if (count <= 8) + return generic::Memset::head_tail(dst, value, count); + if (count <= 16) + return generic::Memset::head_tail(dst, value, count); + if (count <= 32) + return generic::Memset::head_tail(dst, value, count); + if (count <= 64) + return generic::Memset::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) { + return inline_memset_x86_sw_prefetching(dst, value, count); + } + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + // Aligned loop + generic::Memset::block(dst, value); + 
align_to_next_boundary<32>(dst, count); + return generic::Memset::loop_and_tail(dst, value, count); + } } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 3ae68193dccd2..dea21fd771826 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -33,6 +33,7 @@ PRINTF_COPTS = [ MEMORY_COPTS = [ # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0", # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING", + # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING", ] # A flag to pick which `mpfr` to use for math tests. From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 31 Oct 2023 17:05:57 -0400 Subject: [PATCH 02/14] Add software prefetch instructions to memset * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Add warmup to memset --- libc/src/string/memory_utils/op_generic.h | 3 +-- libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 4063de1d5f583..2844501a74590 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -174,8 +174,7 @@ template struct Memset { template LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, size_t count) { - size_t offset = 0; - + size_t offset = 96; while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) sw_prefetch::PrefetchW(dst + offset + prefetch_distance + diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h 
b/libc/src/string/memory_utils/x86_64/inline_memset.h index 4834968c0b99f..98f559bca875a 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = [[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + // Prefetch one cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); if (count <= 128) return generic::Memset::head_tail(dst, value, count); + // Prefetch the next cacheline sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); // Aligned loop generic::Memset::block(dst, value); @@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = if (count <= 192) { return generic::Memset::loop_and_tail(dst, value, count); } else { + generic::Memset::block(dst, value); + generic::Memset::block(dst + sizeof(uint512_t), value); return generic::Memset::loop_and_tail_prefetch<320, 128>( dst, value, count); } From 50ffede6c6c40f2b97eca84d57ca9765ef552fd1 Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 7 Nov 2023 11:52:48 -0500 Subject: [PATCH 03/14] Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes (#4) * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Add warmup to memset * SW Prefetching in Memset * Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes * Fix formatting --- libc/src/string/memory_utils/op_generic.h | 36 ++--- libc/src/string/memory_utils/utils.h | 6 + .../memory_utils/x86_64/inline_memcpy.h | 5 - .../memory_utils/x86_64/inline_memset.h | 132 ++++++++++-------- 4 files changed, 93 insertions(+), 86 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h 
b/libc/src/string/memory_utils/op_generic.h index 2844501a74590..833ab9a6624d6 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -141,19 +141,23 @@ template struct Memset { static_assert(is_element_type_v); static constexpr size_t SIZE = sizeof(T); - LIBC_INLINE static void block(Ptr dst, uint8_t value) { + LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) { if constexpr (is_scalar_v || is_vector_v) { - store(dst, splat(value)); + store(dst + offset, splat(value)); } else if constexpr (is_array_v) { using value_type = typename T::value_type; const auto Splat = splat(value); for (size_t I = 0; I < array_size_v; ++I) - store(dst + (I * sizeof(value_type)), Splat); + store(dst + offset + (I * sizeof(value_type)), Splat); } } + LIBC_INLINE static void block(Ptr dst, uint8_t value) { + block_offset(dst, value, 0); + } + LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); + block_offset(dst, value, count - SIZE); } LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) { @@ -161,32 +165,18 @@ template struct Memset { tail(dst, value, count); } - LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, + size_t count, size_t offset) { static_assert(SIZE > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; do { - block(dst + offset, value); + block_offset(dst, value, offset); offset += SIZE; } while (offset < count - SIZE); tail(dst, value, count); } - template - LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, - size_t count) { - size_t offset = 96; - while (offset + prefetch_degree + SIZE <= count) { - for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) - sw_prefetch::PrefetchW(dst + offset + prefetch_distance + - sw_prefetch::kCachelineSize * i); - for (size_t i = 0; i < 
prefetch_degree; i += SIZE, offset += SIZE) - block(dst + offset, value); - } - while (offset + SIZE < count) { - block(dst + offset, value); - offset += SIZE; - } - tail(dst, value, count); + LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + return loop_and_tail_offset(dst, value, count, 0); } }; diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 85677e51fad0e..62b3b7a0d728b 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -374,6 +374,12 @@ template struct AlignHelper { uintptr_t offset_; }; +LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); } + +LIBC_INLINE void prefetch_to_local_cache(CPtr dst) { + __builtin_prefetch(dst, 0, 3); +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h index f43230ffd8ad1..f851bcec09650 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h +++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h @@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold = } // namespace x86 -// TODO: Move to a shared header when appropriate. 
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) { - __builtin_prefetch(addr, 0, 3); -} - [[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src, size_t count) { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 98f559bca875a..b6d3d5a0b65cb 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -12,83 +12,99 @@ #include "src/string/memory_utils/op_generic.h" #include "src/string/memory_utils/op_x86.h" #include "src/string/memory_utils/utils.h" // Ptr, CPtr +#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h" #include // size_t namespace LIBC_NAMESPACE { namespace x86 { +// Size of one cache line for software prefetching +LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64; +LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2; +LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5; + LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); } // namespace x86 #if defined(__AVX512F__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = generic_v512; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = generic_v512; #elif defined(__AVX__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = cpp::array; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = cpp::array; #elif defined(__SSE2__) - using uint128_t = generic_v128; - using uint256_t = cpp::array; - using uint512_t = cpp::array; +using uint128_t = generic_v128; +using uint256_t = cpp::array; +using uint512_t = cpp::array; #else - using uint128_t = cpp::array; - using uint256_t = cpp::array; - using 
uint512_t = cpp::array; +using uint128_t = cpp::array; +using uint256_t = cpp::array; +using uint512_t = cpp::array; #endif - [[maybe_unused]] LIBC_INLINE static void - inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - // Prefetch one cacheline - sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); - if (count <= 128) - return generic::Memset::head_tail(dst, value, count); - // Prefetch the next cacheline - sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); - // Aligned loop - generic::Memset::block(dst, value); - align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return generic::Memset::loop_and_tail(dst, value, count); - } else { - generic::Memset::block(dst, value); - generic::Memset::block(dst + sizeof(uint512_t), value); - return generic::Memset::loop_and_tail_prefetch<320, 128>( - dst, value, count); +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + size_t prefetch_distance = x86::kFiveCachelinesSize; + size_t prefetch_degree = x86::kTwoCachelinesSize; + size_t SIZE = sizeof(uint256_t); + // Prefetch one cache line + prefetch_for_write(dst + x86::kOneCachelineSize); + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + // Prefetch the second cache line + prefetch_for_write(dst + x86::kTwoCachelinesSize); + // Aligned loop + generic::Memset::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return generic::Memset::loop_and_tail(dst, value, count); + } else { + generic::Memset::block(dst, value); + generic::Memset::block_offset(dst, value, SIZE); + size_t offset = 96; + while (offset + prefetch_degree + SIZE <= count) { + for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i) + prefetch_for_write(dst + offset + prefetch_distance + + x86::kOneCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) + 
generic::Memset::block_offset(dst, value, offset); } + generic::Memset::loop_and_tail_offset(dst, value, count, offset); } +} + +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86(Ptr dst, uint8_t value, size_t count) { + if (count == 0) + return; + if (count == 1) + return generic::Memset::block(dst, value); + if (count == 2) + return generic::Memset::block(dst, value); + if (count == 3) + return generic::MemsetSequence::block(dst, value); + if (count <= 8) + return generic::Memset::head_tail(dst, value, count); + if (count <= 16) + return generic::Memset::head_tail(dst, value, count); + if (count <= 32) + return generic::Memset::head_tail(dst, value, count); + if (count <= 64) + return generic::Memset::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) + return inline_memset_x86_gt64_sw_prefetching(dst, value, count); + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + // Aligned loop + generic::Memset::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset::loop_and_tail(dst, value, count); +} - [[maybe_unused]] LIBC_INLINE static void - inline_memset_x86(Ptr dst, uint8_t value, size_t count) { - if (count == 0) - return; - if (count == 1) - return generic::Memset::block(dst, value); - if (count == 2) - return generic::Memset::block(dst, value); - if (count == 3) - return generic::MemsetSequence::block(dst, value); - if (count <= 8) - return generic::Memset::head_tail(dst, value, count); - if (count <= 16) - return generic::Memset::head_tail(dst, value, count); - if (count <= 32) - return generic::Memset::head_tail(dst, value, count); - if (count <= 64) - return generic::Memset::head_tail(dst, value, count); - if constexpr (x86::kUseSoftwarePrefetchingMemset) { - return inline_memset_x86_sw_prefetching(dst, value, count); - } - if (count <= 128) - return generic::Memset::head_tail(dst, value, count); - // Aligned loop - generic::Memset::block(dst, value); - 
align_to_next_boundary<32>(dst, count); - return generic::Memset::loop_and_tail(dst, value, count); - } } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H From 507b82e4050a44829bba46bf802a48b6d7eafc16 Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 7 Nov 2023 12:46:25 -0500 Subject: [PATCH 04/14] Remove wrong include (#5) * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Add warmup to memset * SW Prefetching in Memset * Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes * Fix formatting * Remove wrong include From 082cdbabaf698e78d1356de8c8815d956f749d56 Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 7 Nov 2023 14:01:56 -0500 Subject: [PATCH 05/14] Remove wrong include (#6) * Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Add warmup to memset * SW Prefetching in Memset * Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes * Fix formatting * Remove wrong include From fbb1f23c8e8e026178a6b2489307dbe9097298d5 Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 7 Nov 2023 19:16:56 +0000 Subject: [PATCH 06/14] Remove wrong include --- libc/src/string/memory_utils/x86_64/inline_memset.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index b6d3d5a0b65cb..9b92cd130bc60 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -12,7 +12,6 @@ #include "src/string/memory_utils/op_generic.h" #include "src/string/memory_utils/op_x86.h" 
#include "src/string/memory_utils/utils.h" // Ptr, CPtr -#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h" #include // size_t From 9cd1f2350059cfd243c79edc95b5148b6299896c Mon Sep 17 00:00:00 2001 From: doshimili Date: Tue, 7 Nov 2023 20:27:50 +0000 Subject: [PATCH 07/14] Fix memset warmup --- libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 9b92cd130bc60..90e8104257703 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -63,7 +63,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { return generic::Memset::loop_and_tail(dst, value, count); } else { generic::Memset::block(dst, value); - generic::Memset::block_offset(dst, value, SIZE); + generic::Memset::block_offset(dst, value, sizeof(uint512_t)); size_t offset = 96; while (offset + prefetch_degree + SIZE <= count) { for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i) From 45e51a3aa663765b61cac68e177da738296accea Mon Sep 17 00:00:00 2001 From: doshimili Date: Wed, 8 Nov 2023 16:33:14 +0000 Subject: [PATCH 08/14] Remove block_offset and other minor changes --- libc/src/string/memory_utils/op_generic.h | 21 +++++-------------- libc/src/string/memory_utils/utils.h | 6 ++++-- .../memory_utils/x86_64/inline_memset.h | 20 ++++++++---------- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 833ab9a6624d6..db218f8577ab5 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -48,13 +48,6 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32))); using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace 
LIBC_NAMESPACE -namespace LIBC_NAMESPACE::sw_prefetch { -// Size of a cacheline for software prefetching -static constexpr size_t kCachelineSize = 64; -// prefetch for write -static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); } -} // namespace LIBC_NAMESPACE::sw_prefetch - namespace LIBC_NAMESPACE::generic { // We accept three types of values as elements for generic operations: @@ -141,23 +134,19 @@ template struct Memset { static_assert(is_element_type_v); static constexpr size_t SIZE = sizeof(T); - LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) { + LIBC_INLINE static void block(Ptr dst, uint8_t value) { if constexpr (is_scalar_v || is_vector_v) { - store(dst + offset, splat(value)); + store(dst, splat(value)); } else if constexpr (is_array_v) { using value_type = typename T::value_type; const auto Splat = splat(value); for (size_t I = 0; I < array_size_v; ++I) - store(dst + offset + (I * sizeof(value_type)), Splat); + store(dst + (I * sizeof(value_type)), Splat); } } - LIBC_INLINE static void block(Ptr dst, uint8_t value) { - block_offset(dst, value, 0); - } - LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) { - block_offset(dst, value, count - SIZE); + block(dst + count - SIZE, value); } LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) { @@ -169,7 +158,7 @@ template struct Memset { size_t count, size_t offset) { static_assert(SIZE > 1, "a loop of size 1 does not need tail"); do { - block_offset(dst, value, offset); + block(dst + offset, value); offset += SIZE; } while (offset < count - SIZE); tail(dst, value, count); diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 62b3b7a0d728b..f70880ee853d3 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -374,10 +374,12 @@ template struct AlignHelper { uintptr_t offset_; }; -LIBC_INLINE void prefetch_for_write(CPtr dst) { 
__builtin_prefetch(dst, 1, 3); } +LIBC_INLINE void prefetch_for_write(CPtr dst) { + __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3); +} LIBC_INLINE void prefetch_to_local_cache(CPtr dst) { - __builtin_prefetch(dst, 0, 3); + __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 90e8104257703..9b95df6633935 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -47,9 +47,9 @@ using uint512_t = cpp::array; [[maybe_unused]] LIBC_INLINE static void inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - size_t prefetch_distance = x86::kFiveCachelinesSize; - size_t prefetch_degree = x86::kTwoCachelinesSize; - size_t SIZE = sizeof(uint256_t); + constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize; + constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize; + constexpr size_t SIZE = sizeof(uint256_t); // Prefetch one cache line prefetch_for_write(dst + x86::kOneCachelineSize); if (count <= 128) @@ -62,15 +62,13 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { if (count <= 192) { return generic::Memset::loop_and_tail(dst, value, count); } else { - generic::Memset::block(dst, value); - generic::Memset::block_offset(dst, value, sizeof(uint512_t)); + generic::MemsetSequence::block(dst, value); size_t offset = 96; - while (offset + prefetch_degree + SIZE <= count) { - for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i) - prefetch_for_write(dst + offset + prefetch_distance + - x86::kOneCachelineSize * i); - for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) - generic::Memset::block_offset(dst, value, offset); + while (offset + PREFETCH_DEGREE + SIZE <= count) { + prefetch_for_write(dst + offset + PREFETCH_DISTANCE); + prefetch_for_write(dst + 
offset + PREFETCH_DISTANCE + kOneCachelineSize); + for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE) + generic::Memset::block(dst + offset, value); } generic::Memset::loop_and_tail_offset(dst, value, count, offset); } From 13adbd113d47dd18de72737a27c0251b9ac98513 Mon Sep 17 00:00:00 2001 From: doshimili Date: Wed, 8 Nov 2023 16:42:38 +0000 Subject: [PATCH 09/14] Bug fixes --- libc/src/string/memory_utils/x86_64/inline_memset.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 9b95df6633935..c980a1cde7b36 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -66,7 +66,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { size_t offset = 96; while (offset + PREFETCH_DEGREE + SIZE <= count) { prefetch_for_write(dst + offset + PREFETCH_DISTANCE); - prefetch_for_write(dst + offset + PREFETCH_DISTANCE + kOneCachelineSize); + prefetch_for_write(dst + offset + PREFETCH_DISTANCE + x86::kOneCachelineSize); for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE) generic::Memset::block(dst + offset, value); } @@ -101,7 +101,6 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { align_to_next_boundary<32>(dst, count); return generic::Memset::loop_and_tail(dst, value, count); } - } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H From c15871bdd519490249d49c96bac3c8c158147ccf Mon Sep 17 00:00:00 2001 From: doshimili Date: Wed, 8 Nov 2023 16:59:26 +0000 Subject: [PATCH 10/14] Formatting fixes --- libc/src/string/memory_utils/x86_64/inline_memset.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index c980a1cde7b36..42559b6ffa9bf 100644 --- 
a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -66,7 +66,8 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { size_t offset = 96; while (offset + PREFETCH_DEGREE + SIZE <= count) { prefetch_for_write(dst + offset + PREFETCH_DISTANCE); - prefetch_for_write(dst + offset + PREFETCH_DISTANCE + x86::kOneCachelineSize); + prefetch_for_write(dst + offset + PREFETCH_DISTANCE + + x86::kOneCachelineSize); for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE) generic::Memset::block(dst + offset, value); } From c08bea25eea629ea2a25b2caf483c970bbe26969 Mon Sep 17 00:00:00 2001 From: doshimili Date: Wed, 8 Nov 2023 17:20:44 +0000 Subject: [PATCH 11/14] Formatting fixes --- libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 42559b6ffa9bf..41eadf2dcc00c 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -67,7 +67,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { while (offset + PREFETCH_DEGREE + SIZE <= count) { prefetch_for_write(dst + offset + PREFETCH_DISTANCE); prefetch_for_write(dst + offset + PREFETCH_DISTANCE + - x86::kOneCachelineSize); + x86::kOneCachelineSize); for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE) generic::Memset::block(dst + offset, value); } From a5ab2993207ac5bd789109f41d522ddb46216baa Mon Sep 17 00:00:00 2001 From: doshimili Date: Thu, 9 Nov 2023 15:07:42 +0000 Subject: [PATCH 12/14] Add memset option to config.json --- libc/config/config.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libc/config/config.json b/libc/config/config.json index 3c74e0ed1eddf..a7f2e5113d660 100644 --- a/libc/config/config.json +++ b/libc/config/config.json 
@@ -21,6 +21,10 @@ "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { "value": false, "doc": "Read more than a byte at a time to perform byte-string operations like strlen." + }, + "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": { + "value": false, + "doc": "Use software prefetching in memset to increase performance." } } } From 9f594586d222515a8139f98a862107e7e65d416d Mon Sep 17 00:00:00 2001 From: doshimili Date: Thu, 9 Nov 2023 15:31:14 +0000 Subject: [PATCH 13/14] Add configuration to CMakeLists --- libc/src/string/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index aa69bff7a8cfa..6daaf1998ea7b 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -3,6 +3,9 @@ add_subdirectory(memory_utils) if(LIBC_CONF_STRING_UNSAFE_WIDE_READ) list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ") endif() +if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING) + list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING") +endif() if(string_config_options) list(PREPEND string_config_options "COMPILE_OPTIONS") endif() From b7e154774d54f5c8c6c73d6db9e7c81e232499b0 Mon Sep 17 00:00:00 2001 From: doshimili Date: Thu, 9 Nov 2023 15:38:06 +0000 Subject: [PATCH 14/14] Modify docstring to describe the configuration better --- libc/config/config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/config/config.json b/libc/config/config.json index a7f2e5113d660..77d10d75f3646 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -24,7 +24,7 @@ }, "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": { "value": false, - "doc": "Use software prefetching in memset to increase performance." + "doc": "Inserts prefetch-for-write instructions (PREFETCHW) into memset on x86 to recover performance when the hardware prefetcher is disabled." } } }