[libc] Optimize memcpy size thresholds #70049

Merged (1 commit, Nov 7, 2023)
libc/src/string/memory_utils/x86_64/inline_memcpy.h (32 changes: 25 additions & 7 deletions)
@@ -55,7 +55,7 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                             size_t count) {
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   builtin::Memcpy<32>::block(dst, src);
   align_to_next_boundary<32, Arg::Dst>(dst, src, count);
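For context on the boundary change: `builtin::Memcpy<N>::head_tail(dst, src, count)` copies one N-byte block from the front of the buffer and one N-byte block ending at its last byte, which covers every `count` in [N, 2N]. A minimal sketch of the pattern (illustrative only, not the libc implementation; `head_tail` here is a stand-in written with plain `memcpy`):

```cpp
#include <cstddef>
#include <cstring>

// Sketch of the head/tail pattern: valid for N <= count <= 2 * N.
// At count == 2 * N the two blocks tile the buffer exactly; at
// count == N they collapse onto the same bytes; in between they overlap.
template <size_t N>
void head_tail(char *__restrict dst, const char *__restrict src,
               size_t count) {
  std::memcpy(dst, src, N);                         // head: bytes [0, N)
  std::memcpy(dst + count - N, src + count - N, N); // tail: [count - N, count)
}
```

Changing `<` to `<=` lets `count == 128` take this cheap branch: two 64-byte copies tile the 128 bytes exactly, rather than falling into the align-and-loop path below.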
@@ -65,7 +65,7 @@ inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
                            size_t count) {
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   if (count < 256)
     return builtin::Memcpy<128>::head_tail(dst, src, count);
@@ -79,7 +79,7 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                            CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
   prefetch_to_local_cache(src + kOneCacheline);
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   prefetch_to_local_cache(src + kTwoCachelines);
   // Aligning 'dst' on a 32B boundary.
@@ -120,7 +120,7 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
   prefetch_to_local_cache(src + kOneCacheline);
-  if (count < 128)
+  if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   prefetch_to_local_cache(src + kTwoCachelines);
   prefetch_to_local_cache(src + kThreeCachelines);
@@ -149,6 +149,15 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,

 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+#if defined(__AVX512F__)
+  constexpr size_t vector_size = 64;
+#elif defined(__AVX__)
+  constexpr size_t vector_size = 32;
+#elif defined(__SSE2__)
+  constexpr size_t vector_size = 16;
+#else
+  constexpr size_t vector_size = 8;
+#endif
   if (count == 0)
     return;
   if (count == 1)
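The widest profitable block size is thus fixed at compile time from the feature-test macros the compiler predefines (`-mavx512f` sets `__AVX512F__`, `-mavx` sets `__AVX__`, `-msse2` or the x86-64 baseline sets `__SSE2__`). A standalone sketch showing how the same ladder maps flags to widths (hypothetical helper, not part of the patch):

```cpp
#include <cstddef>
#include <cstdio>

// Mirrors the #if ladder in the patch: pick the widest single-register
// move the target guarantees. The macros are predefined by the compiler
// from -m/-march flags, so the selection costs nothing at run time.
constexpr size_t native_vector_size() {
#if defined(__AVX512F__)
  return 64; // one ZMM move
#elif defined(__AVX__)
  return 32; // one YMM move
#elif defined(__SSE2__)
  return 16; // one XMM move
#else
  return 8; // widest general-purpose register move
#endif
}

int main() {
  std::printf("vector_size = %zu\n", native_vector_size());
  return 0;
}
```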
@@ -161,11 +170,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
     return builtin::Memcpy<4>::block(dst, src);
   if (count < 8)
     return builtin::Memcpy<4>::head_tail(dst, src, count);
-  if (count < 16)
+  // If count is equal to a power of 2, we can handle it as head-tail
+  // of both smaller size and larger size (head-tail are either
+  // non-overlapping for smaller size, or completely collapsed
+  // for larger size). It seems to be more profitable to do the copy
+  // with the larger size, if it's natively supported (e.g. doing
+  // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+  // But it's not profitable to use larger size if it's not natively
+  // supported: we will both use more instructions and handle fewer
+  // sizes in earlier branches.
+  if (vector_size >= 16 ? count < 16 : count <= 16)
     return builtin::Memcpy<8>::head_tail(dst, src, count);
-  if (count < 32)
+  if (vector_size >= 32 ? count < 32 : count <= 32)
     return builtin::Memcpy<16>::head_tail(dst, src, count);
-  if (count < 64)
+  if (vector_size >= 64 ? count < 64 : count <= 64)
     return builtin::Memcpy<32>::head_tail(dst, src, count);
   if constexpr (x86::kAvx) {
     if constexpr (x86::kUseSoftwarePrefetching) {