diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 866aea7b69d4e..f115d2a26606a 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -142,10 +142,16 @@ static inline constexpr bool is_pow2(uint64_t x) {
   return x && (x & (x - 1)) == 0;
 }
 
-// Where this chunk size should start looking in the global array.
-static inline constexpr uint32_t start_index(uint32_t chunk_index) {
-  return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
-         impl::get_chunk_id(SLAB_SIZE / 2);
+// Where this chunk size should start looking in the global array. Small
+// allocations are much more likely than large ones, so we give them the most
+// space. We use a cubic easing function normalized on the possible chunks.
+static inline constexpr uint32_t start_index(uint32_t chunk_size) {
+  constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
+  uint64_t norm =
+      (1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
+  uint64_t bias = (norm * norm * norm) >> 32;
+  uint64_t inv = (1 << 16) - bias;
+  return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
 }
 
 } // namespace impl
@@ -487,9 +493,10 @@ static Slab *find_slab(uint32_t chunk_size) {
   uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
   uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
 
-  for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
+  for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
     uint32_t index =
-        !offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
+        !offset ? start
+                : (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
 
     if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
       uint64_t lane_mask = gpu::get_lane_mask();
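
For reference, a minimal standalone sketch of the new `start_index()` mapping so the fixed-point cubic easing can be inspected in isolation. The `ARRAY_SIZE` and `MAX_CHUNK_ID` values below are placeholders chosen for illustration (not the allocator's real constants), and `impl::get_chunk_id(chunk_size)` is replaced by a plain `chunk_id` parameter; only the arithmetic mirrors the patch.

```cpp
#include <cstdint>
#include <cstdio>

// Illustration only: stand-in values, not those used by
// libc/src/__support/GPU/allocator.cpp.
constexpr uint32_t ARRAY_SIZE = 128;  // hypothetical number of slab slots
constexpr uint32_t MAX_CHUNK_ID = 16; // hypothetical impl::get_chunk_id(SLAB_SIZE / 2)

// Same fixed-point cubic easing as the patch, taking the chunk id directly.
constexpr uint32_t start_index(uint32_t chunk_id) {
  // Normalize the chunk id to a Q16 fraction, inverted so the smallest chunk
  // maps to 1.0 and the largest maps to 0.0.
  uint64_t norm =
      (1 << 16) - (static_cast<uint64_t>(chunk_id) << 16) / MAX_CHUNK_ID;
  // Cube it: Q16 * Q16 * Q16 is Q48, so shift right by 32 to return to Q16.
  uint64_t bias = (norm * norm * norm) >> 32;
  // Flip back so small chunk ids land near the front of the array while the
  // cubic curve squeezes the large chunk ids together near the end.
  uint64_t inv = (1 << 16) - bias;
  return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
}

int main() {
  // Consecutive chunk ids start far apart at the small end and bunch up at
  // the large end, which is what gives small allocations most of the array.
  for (uint32_t id = 0; id <= MAX_CHUNK_ID; ++id)
    std::printf("chunk_id %2u -> start %3u\n", id, start_index(id));
}
```

Because the slope of the cubic is steepest at the small-chunk end, consecutive small chunk ids are spaced furthest apart in the array, so small allocations get the largest share of starting slots while the rarer large chunk sizes are compressed into the tail.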