diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h index 2844501a74590..833ab9a6624d6 100644 --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -141,19 +141,23 @@ template struct Memset { static_assert(is_element_type_v); static constexpr size_t SIZE = sizeof(T); - LIBC_INLINE static void block(Ptr dst, uint8_t value) { + LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) { if constexpr (is_scalar_v || is_vector_v) { - store(dst, splat(value)); + store(dst + offset, splat(value)); } else if constexpr (is_array_v) { using value_type = typename T::value_type; const auto Splat = splat(value); for (size_t I = 0; I < array_size_v; ++I) - store(dst + (I * sizeof(value_type)), Splat); + store(dst + offset + (I * sizeof(value_type)), Splat); } } + LIBC_INLINE static void block(Ptr dst, uint8_t value) { + block_offset(dst, value, 0); + } + LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) { - block(dst + count - SIZE, value); + block_offset(dst, value, count - SIZE); } LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) { @@ -161,32 +165,18 @@ template struct Memset { tail(dst, value, count); } - LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, + size_t count, size_t offset) { static_assert(SIZE > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; do { - block(dst + offset, value); + block_offset(dst, value, offset); offset += SIZE; } while (offset < count - SIZE); tail(dst, value, count); } - template - LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value, - size_t count) { - size_t offset = 96; - while (offset + prefetch_degree + SIZE <= count) { - for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i) - sw_prefetch::PrefetchW(dst + offset + prefetch_distance + - sw_prefetch::kCachelineSize * i); - for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) - block(dst + offset, value); - } - while (offset + SIZE < count) { - block(dst + offset, value); - offset += SIZE; - } - tail(dst, value, count); + LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + return loop_and_tail_offset(dst, value, count, 0); } }; diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h index 85677e51fad0e..62b3b7a0d728b 100644 --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -374,6 +374,12 @@ template struct AlignHelper { uintptr_t offset_; }; +LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); } + +LIBC_INLINE void prefetch_to_local_cache(CPtr dst) { + __builtin_prefetch(dst, 0, 3); +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h index f43230ffd8ad1..f851bcec09650 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h +++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h @@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold = } // namespace x86 -// TODO: Move to a shared header when appropriate. -[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) { - __builtin_prefetch(addr, 0, 3); -} - [[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src, size_t count) { diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h index 98f559bca875a..9b92cd130bc60 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memset.h +++ b/libc/src/string/memory_utils/x86_64/inline_memset.h @@ -17,78 +17,93 @@ namespace LIBC_NAMESPACE { namespace x86 { +// Size of one cache line for software prefetching +LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64; +LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2; +LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5; + LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset = LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); } // namespace x86 #if defined(__AVX512F__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = generic_v512; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = generic_v512; #elif defined(__AVX__) - using uint128_t = generic_v128; - using uint256_t = generic_v256; - using uint512_t = cpp::array; +using uint128_t = generic_v128; +using uint256_t = generic_v256; +using uint512_t = cpp::array; #elif defined(__SSE2__) - using uint128_t = generic_v128; - using uint256_t = cpp::array; - using uint512_t = cpp::array; +using uint128_t = generic_v128; +using uint256_t = cpp::array; +using uint512_t = cpp::array; #else - using uint128_t = cpp::array; - using uint256_t = cpp::array; - using uint512_t = cpp::array; +using uint128_t = cpp::array; +using uint256_t = cpp::array; +using uint512_t = cpp::array; #endif - [[maybe_unused]] LIBC_INLINE static void - inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) { - // Prefetch one cacheline - sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize); - if (count <= 128) - return generic::Memset::head_tail(dst, value, count); - // Prefetch the next cacheline - sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2); - // Aligned loop - generic::Memset::block(dst, value); - align_to_next_boundary<32>(dst, count); - if (count <= 192) { - return generic::Memset::loop_and_tail(dst, value, count); - } else { - generic::Memset::block(dst, value); - generic::Memset::block(dst + sizeof(uint512_t), value); - return generic::Memset::loop_and_tail_prefetch<320, 128>( - dst, value, count); +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { + size_t prefetch_distance = x86::kFiveCachelinesSize; + size_t prefetch_degree = x86::kTwoCachelinesSize; + size_t SIZE = sizeof(uint256_t); + // Prefetch one cache line + prefetch_for_write(dst + x86::kOneCachelineSize); + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + // Prefetch the second cache line + prefetch_for_write(dst + x86::kTwoCachelinesSize); + // Aligned loop + generic::Memset::block(dst, value); + align_to_next_boundary<32>(dst, count); + if (count <= 192) { + return generic::Memset::loop_and_tail(dst, value, count); + } else { + generic::Memset::block(dst, value); + generic::Memset::block_offset(dst, value, SIZE); + size_t offset = 96; + while (offset + prefetch_degree + SIZE <= count) { + for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i) + prefetch_for_write(dst + offset + prefetch_distance + + x86::kOneCachelineSize * i); + for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE) + generic::Memset::block_offset(dst, value, offset); } + generic::Memset::loop_and_tail_offset(dst, value, count, offset); } +} + +[[maybe_unused]] LIBC_INLINE static void +inline_memset_x86(Ptr dst, uint8_t value, size_t count) { + if (count == 0) + return; + if (count == 1) + return generic::Memset::block(dst, value); + if (count == 2) + return generic::Memset::block(dst, value); + if (count == 3) + return generic::MemsetSequence::block(dst, value); + if (count <= 8) + return generic::Memset::head_tail(dst, value, count); + if (count <= 16) + return generic::Memset::head_tail(dst, value, count); + if (count <= 32) + return generic::Memset::head_tail(dst, value, count); + if (count <= 64) + return generic::Memset::head_tail(dst, value, count); + if constexpr (x86::kUseSoftwarePrefetchingMemset) + return inline_memset_x86_gt64_sw_prefetching(dst, value, count); + if (count <= 128) + return generic::Memset::head_tail(dst, value, count); + // Aligned loop + generic::Memset::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset::loop_and_tail(dst, value, count); +} - [[maybe_unused]] LIBC_INLINE static void - inline_memset_x86(Ptr dst, uint8_t value, size_t count) { - if (count == 0) - return; - if (count == 1) - return generic::Memset::block(dst, value); - if (count == 2) - return generic::Memset::block(dst, value); - if (count == 3) - return generic::MemsetSequence::block(dst, value); - if (count <= 8) - return generic::Memset::head_tail(dst, value, count); - if (count <= 16) - return generic::Memset::head_tail(dst, value, count); - if (count <= 32) - return generic::Memset::head_tail(dst, value, count); - if (count <= 64) - return generic::Memset::head_tail(dst, value, count); - if constexpr (x86::kUseSoftwarePrefetchingMemset) { - return inline_memset_x86_sw_prefetching(dst, value, count); - } - if (count <= 128) - return generic::Memset::head_tail(dst, value, count); - // Aligned loop - generic::Memset::block(dst, value); - align_to_next_boundary<32>(dst, count); - return generic::Memset::loop_and_tail(dst, value, count); - } } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H