Skip to content

Commit 50ffede

Browse files
authored
Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes (#4)
* Add software prefetching to memset * Add software prefetching to memset * Fix formatting * Fix build errors * Fix build errors * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Fix formatting * Add warmup to memset * SW Prefetching in Memset * Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes * Fix formatting
1 parent 01be692 commit 50ffede

File tree

4 files changed

+93
-86
lines changed

4 files changed

+93
-86
lines changed

libc/src/string/memory_utils/op_generic.h

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -141,52 +141,42 @@ template <typename T> struct Memset {
141141
static_assert(is_element_type_v<T>);
142142
static constexpr size_t SIZE = sizeof(T);
143143

144-
LIBC_INLINE static void block(Ptr dst, uint8_t value) {
144+
LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
145145
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
146-
store<T>(dst, splat<T>(value));
146+
store<T>(dst + offset, splat<T>(value));
147147
} else if constexpr (is_array_v<T>) {
148148
using value_type = typename T::value_type;
149149
const auto Splat = splat<value_type>(value);
150150
for (size_t I = 0; I < array_size_v<T>; ++I)
151-
store<value_type>(dst + (I * sizeof(value_type)), Splat);
151+
store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
152152
}
153153
}
154154

155+
LIBC_INLINE static void block(Ptr dst, uint8_t value) {
156+
block_offset(dst, value, 0);
157+
}
158+
155159
LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
156-
block(dst + count - SIZE, value);
160+
block_offset(dst, value, count - SIZE);
157161
}
158162

159163
LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
160164
block(dst, value);
161165
tail(dst, value, count);
162166
}
163167

164-
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
168+
LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
169+
size_t count, size_t offset) {
165170
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
166-
size_t offset = 0;
167171
do {
168-
block(dst + offset, value);
172+
block_offset(dst, value, offset);
169173
offset += SIZE;
170174
} while (offset < count - SIZE);
171175
tail(dst, value, count);
172176
}
173177

174-
template <size_t prefetch_distance, size_t prefetch_degree>
175-
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
176-
size_t count) {
177-
size_t offset = 96;
178-
while (offset + prefetch_degree + SIZE <= count) {
179-
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
180-
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
181-
sw_prefetch::kCachelineSize * i);
182-
for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
183-
block(dst + offset, value);
184-
}
185-
while (offset + SIZE < count) {
186-
block(dst + offset, value);
187-
offset += SIZE;
188-
}
189-
tail(dst, value, count);
178+
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
179+
return loop_and_tail_offset(dst, value, count, 0);
190180
}
191181
};
192182

libc/src/string/memory_utils/utils.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,12 @@ template <size_t SIZE> struct AlignHelper {
374374
uintptr_t offset_;
375375
};
376376

377+
// Issues a software prefetch hint for the address `dst` ahead of a store.
// Per the documented __builtin_prefetch(addr, rw, locality) arguments, rw = 1
// requests the line in preparation for a write and locality = 3 asks for the
// highest degree of temporal locality. This is only a hint; it returns nothing.
LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
378+
379+
// Issues a software prefetch hint for the address `dst` ahead of a load.
// Per the documented __builtin_prefetch(addr, rw, locality) arguments, rw = 0
// requests the line in preparation for a read and locality = 3 asks for the
// highest degree of temporal locality. This is only a hint; it returns nothing.
LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
380+
__builtin_prefetch(dst, 0, 3);
381+
}
382+
377383
} // namespace LIBC_NAMESPACE
378384

379385
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H

libc/src/string/memory_utils/x86_64/inline_memcpy.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
4747

4848
} // namespace x86
4949

50-
// TODO: Move to a shared header when appropriate.
51-
[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
52-
__builtin_prefetch(addr, 0, 3);
53-
}
54-
5550
[[maybe_unused]] LIBC_INLINE void
5651
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
5752
size_t count) {

libc/src/string/memory_utils/x86_64/inline_memset.h

Lines changed: 74 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -12,83 +12,99 @@
1212
#include "src/string/memory_utils/op_generic.h"
1313
#include "src/string/memory_utils/op_x86.h"
1414
#include "src/string/memory_utils/utils.h" // Ptr, CPtr
15+
#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
1516

1617
#include <stddef.h> // size_t
1718

1819
namespace LIBC_NAMESPACE {
1920
namespace x86 {
21+
// Size of one cache line for software prefetching
22+
LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
23+
LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
24+
LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
25+
2026
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
2127
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
2228

2329
} // namespace x86
2430

2531
#if defined(__AVX512F__)
26-
using uint128_t = generic_v128;
27-
using uint256_t = generic_v256;
28-
using uint512_t = generic_v512;
32+
using uint128_t = generic_v128;
33+
using uint256_t = generic_v256;
34+
using uint512_t = generic_v512;
2935
#elif defined(__AVX__)
30-
using uint128_t = generic_v128;
31-
using uint256_t = generic_v256;
32-
using uint512_t = cpp::array<generic_v256, 2>;
36+
using uint128_t = generic_v128;
37+
using uint256_t = generic_v256;
38+
using uint512_t = cpp::array<generic_v256, 2>;
3339
#elif defined(__SSE2__)
34-
using uint128_t = generic_v128;
35-
using uint256_t = cpp::array<generic_v128, 2>;
36-
using uint512_t = cpp::array<generic_v128, 4>;
40+
using uint128_t = generic_v128;
41+
using uint256_t = cpp::array<generic_v128, 2>;
42+
using uint512_t = cpp::array<generic_v128, 4>;
3743
#else
38-
using uint128_t = cpp::array<uint64_t, 2>;
39-
using uint256_t = cpp::array<uint64_t, 4>;
40-
using uint512_t = cpp::array<uint64_t, 8>;
44+
using uint128_t = cpp::array<uint64_t, 2>;
45+
using uint256_t = cpp::array<uint64_t, 4>;
46+
using uint512_t = cpp::array<uint64_t, 8>;
4147
#endif
4248

43-
[[maybe_unused]] LIBC_INLINE static void
44-
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
45-
// Prefetch one cacheline
46-
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
47-
if (count <= 128)
48-
return generic::Memset<uint512_t>::head_tail(dst, value, count);
49-
// Prefetch the next cacheline
50-
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
51-
// Aligned loop
52-
generic::Memset<uint256_t>::block(dst, value);
53-
align_to_next_boundary<32>(dst, count);
54-
if (count <= 192) {
55-
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
56-
} else {
57-
generic::Memset<uint512_t>::block(dst, value);
58-
generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
59-
return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
60-
dst, value, count);
49+
// memset implementation for the x86 software-prefetching configuration.
//
// Parameters:
//   dst   - start of the destination buffer.
//   value - byte value stored into every position.
//   count - number of bytes to set. The only caller in this file invokes this
//           function after handling every size up to 64 bytes itself.
//
// Strategy:
//   * count <= 128: two (possibly overlapping) 64-byte stores via head_tail.
//   * count <= 192 (after alignment): plain 32-byte store loop plus tail.
//   * otherwise: a 96-byte warm-up, then a loop that issues write-prefetches
//     `prefetch_distance` bytes ahead of the stores and writes
//     `prefetch_degree` bytes per iteration, then a plain loop plus tail for
//     whatever remains.
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
  // Tuning parameters: how far ahead of the current store offset to prefetch,
  // and how many bytes are prefetched/stored per main-loop iteration.
  constexpr size_t prefetch_distance = x86::kFiveCachelinesSize;
  constexpr size_t prefetch_degree = x86::kTwoCachelinesSize;
  constexpr size_t SIZE = sizeof(uint256_t);
  // Prefetch one cache line
  prefetch_for_write(dst + x86::kOneCachelineSize);
  if (count <= 128)
    return generic::Memset<uint512_t>::head_tail(dst, value, count);
  // Prefetch the second cache line
  prefetch_for_write(dst + x86::kTwoCachelinesSize);
  // Aligned loop: one 32-byte store at the original `dst` covers the head
  // before `dst`/`count` are adjusted by align_to_next_boundary (defined
  // elsewhere; presumably advances dst to a 32-byte boundary and shrinks count
  // accordingly).
  generic::Memset<uint256_t>::block(dst, value);
  align_to_next_boundary<32>(dst, count);
  if (count <= 192)
    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);

  // Warm-up: fill [0, 64) with one 512-bit block and [64, 96) with one
  // 256-bit block. The second store must start at sizeof(uint512_t); starting
  // it at sizeof(uint256_t) would rewrite [32, 64) and leave [64, 96) unset,
  // because the loop below begins right after the warm-up region.
  generic::Memset<uint512_t>::block(dst, value);
  generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
  size_t offset = sizeof(uint512_t) + SIZE; // 96 bytes already written.

  // Main loop: runs while a full `prefetch_degree` chunk plus one trailing
  // block still fits, so the final loop_and_tail_offset call always has at
  // least one whole block left to write.
  while (offset + prefetch_degree + SIZE <= count) {
    // Request the cache lines that will be written `prefetch_distance` bytes
    // from now, one hint per cache line in the chunk.
    for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
      prefetch_for_write(dst + offset + prefetch_distance +
                         x86::kOneCachelineSize * i);
    // Store the current chunk in 32-byte blocks.
    for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
      generic::Memset<uint256_t>::block_offset(dst, value, offset);
  }
  // Remaining bytes: plain 32-byte loop from `offset`, then the final
  // (possibly overlapping) block ending exactly at `count`.
  generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
}
79+
80+
// x86 memset dispatcher: selects a store strategy from `count` alone.
//
//   0            - nothing to do.
//   1, 2, 3      - exact-size scalar stores.
//   4 .. 64      - head_tail: one block at the start and one block ending at
//                  `count`; the two may overlap.
//   above 64     - the software-prefetching routine when
//                  x86::kUseSoftwarePrefetchingMemset is set; otherwise
//                  head_tail with 64-byte blocks up to 128 bytes, and a
//                  32-byte store loop beyond that.
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
  // Sizes small enough to be handled by a fixed store sequence.
  switch (count) {
  case 0:
    return;
  case 1:
    return generic::Memset<uint8_t>::block(dst, value);
  case 2:
    return generic::Memset<uint16_t>::block(dst, value);
  case 3:
    return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
  default:
    break;
  }

  // From here on count >= 4; each range uses the widest type that still fits.
  if (count <= 8)
    return generic::Memset<uint32_t>::head_tail(dst, value, count);
  if (count <= 16)
    return generic::Memset<uint64_t>::head_tail(dst, value, count);
  if (count <= 32)
    return generic::Memset<uint128_t>::head_tail(dst, value, count);
  if (count <= 64)
    return generic::Memset<uint256_t>::head_tail(dst, value, count);

  if constexpr (x86::kUseSoftwarePrefetchingMemset) {
    // Build-time opt-in: everything above 64 bytes goes to the prefetching
    // implementation.
    return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
  } else {
    if (count <= 128)
      return generic::Memset<uint512_t>::head_tail(dst, value, count);
    // Aligned loop: write the first 32 bytes, adjust dst/count with
    // align_to_next_boundary, then loop in 32-byte blocks and finish with the
    // tail block.
    generic::Memset<uint256_t>::block(dst, value);
    align_to_next_boundary<32>(dst, count);
    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
  }
}
63107

64-
[[maybe_unused]] LIBC_INLINE static void
65-
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
66-
if (count == 0)
67-
return;
68-
if (count == 1)
69-
return generic::Memset<uint8_t>::block(dst, value);
70-
if (count == 2)
71-
return generic::Memset<uint16_t>::block(dst, value);
72-
if (count == 3)
73-
return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
74-
if (count <= 8)
75-
return generic::Memset<uint32_t>::head_tail(dst, value, count);
76-
if (count <= 16)
77-
return generic::Memset<uint64_t>::head_tail(dst, value, count);
78-
if (count <= 32)
79-
return generic::Memset<uint128_t>::head_tail(dst, value, count);
80-
if (count <= 64)
81-
return generic::Memset<uint256_t>::head_tail(dst, value, count);
82-
if constexpr (x86::kUseSoftwarePrefetchingMemset) {
83-
return inline_memset_x86_sw_prefetching(dst, value, count);
84-
}
85-
if (count <= 128)
86-
return generic::Memset<uint512_t>::head_tail(dst, value, count);
87-
// Aligned loop
88-
generic::Memset<uint256_t>::block(dst, value);
89-
align_to_next_boundary<32>(dst, count);
90-
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
91-
}
92108
} // namespace LIBC_NAMESPACE
93109

94110
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H

0 commit comments

Comments
 (0)