diff --git a/libc/config/config.json b/libc/config/config.json index 3c74e0ed1eddf..77d10d75f3646 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -21,6 +21,10 @@ "LIBC_CONF_STRING_UNSAFE_WIDE_READ": { "value": false, "doc": "Read more than a byte at a time to perform byte-string operations like strlen." + }, + "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": { + "value": false, + "doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled." } } } diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 67675b682081c..6daaf1998ea7b 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -3,6 +3,9 @@ add_subdirectory(memory_utils) if(LIBC_CONF_STRING_UNSAFE_WIDE_READ) list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ") endif() +if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING) + list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING") +endif() if(string_config_options) list(PREPEND string_config_options "COMPILE_OPTIONS") endif() @@ -656,6 +659,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2) add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2) add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) + add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memset(memset) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index fd71ca30e24b9..db218f8577ab5 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -154,15 +154,19 @@ template struct Memset { tail(dst, value, count); } - LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, + size_t count, size_t offset) { static_assert(SIZE > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; do { block(dst + offset, value); offset += SIZE; } while (offset < count - SIZE); tail(dst, value, count); } + + LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + return loop_and_tail_offset(dst, value, count, 0); + } }; template struct MemsetSequence { diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 85677e51fad0e..f70880ee853d3 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -374,6 +374,14 @@ template struct AlignHelper { uintptr_t offset_; }; +LIBC_INLINE void prefetch_for_write(CPtr dst) { + __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3); +} + +LIBC_INLINE void prefetch_to_local_cache(CPtr dst) { + __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3); +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h index f43230ffd8ad1..f851bcec09650 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h +++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h @@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold = } // namespace x86 -// TODO: Move to a shared header when appropriate. -[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) { - __builtin_prefetch(addr, 0, 3); -} - [[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src, size_t count) { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 6436594856b0e..41eadf2dcc00c 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -16,27 +16,67 @@ #include // size_t namespace LIBC_NAMESPACE { +namespace x86 { +// Size of one cache line for software prefetching +LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64; +LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2; +LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5; + +LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = + LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); + +} // namespace x86 -[[maybe_unused]] LIBC_INLINE static void -inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = generic_v512; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = generic_v512; #elif defined(__AVX__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = cpp::array; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = cpp::array; #elif defined(__SSE2__) - using uint128_t = generic_v128; - using uint256_t = cpp::array; - using uint512_t = cpp::array; +using uint128_t = generic_v128; +using uint256_t = cpp::array; +using uint512_t = cpp::array; #else - using uint128_t = cpp::array; - using uint256_t = cpp::array; - using uint512_t = cpp::array; +using uint128_t = cpp::array; +using uint256_t = cpp::array; +using uint512_t = cpp::array; #endif +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize; + constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize; + constexpr size_t SIZE = sizeof(uint256_t); + // Prefetch one cache line + prefetch_for_write(dst + x86::kOneCachelineSize); + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + // Prefetch the second cache line + prefetch_for_write(dst + x86::kTwoCachelinesSize); + // Aligned loop + generic::Memset::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return generic::Memset::loop_and_tail(dst, value, count); + } else { + generic::MemsetSequence::block(dst, value); + size_t offset = 96; + while (offset + PREFETCH_DEGREE + SIZE <= count) { + prefetch_for_write(dst + offset + PREFETCH_DISTANCE); + prefetch_for_write(dst + offset + PREFETCH_DISTANCE + + x86::kOneCachelineSize); + for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE) + generic::Memset::block(dst + offset, value); + } + generic::Memset::loop_and_tail_offset(dst, value, count, offset); + } +} + +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86(Ptr dst, uint8_t value, size_t count) { if (count == 0) return; if (count == 1) @@ -53,6 +93,8 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) { return generic::Memset::head_tail(dst, value, count); if (count <= 64) return generic::Memset::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) + return inline_memset_x86_gt64_sw_prefetching(dst, value, count); if (count <= 128) return generic::Memset::head_tail(dst, value, count); // Aligned loop diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 3ae68193dccd2..dea21fd771826 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -33,6 +33,7 @@ PRINTF_COPTS = [ MEMORY_COPTS = [ # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0", # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING", + # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING", ] # A flag to pick which `mpfr` to use for math tests.