From c585541e671bc76d60163356c435bfb32ef858b4 Mon Sep 17 00:00:00 2001 From: Joshua Wong <joshuawong@anticentri.st> Date: Fri, 12 Apr 2024 18:15:02 -0500 Subject: [PATCH 1/4] optimize in-place collection of `Vec` LLVM does not know that the multiplication never overflows, which causes it to generate unnecessary instructions. Use `usize::unchecked_mul`, so that it can fold the `dst_cap` calculation when `size_of::<I::SRC>() == size_of::<T>()`. Running: ``` rustc -C llvm-args=-x86-asm-syntax=intel -O src/lib.rs --emit asm` ``` ```rust pub struct Foo([usize; 3]); pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> { v.into_iter().map(|f| f.0).collect() } ``` Before this commit: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24 %dst_cap.i.i = udiv i64 %_16.i.i, 24 store i64 %dst_cap.i.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8 ret void } ``` After: ``` define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) { start: %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8 %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8 %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8 %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16 %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8 %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24 %0 = udiv i64 %_19.i.idx, 24 store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8 %1 = getelementptr inbounds i8, ptr %_0, i64 8 store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8 %2 = getelementptr inbounds i8, ptr %_0, i64 16 store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14 ret void } ``` Note that there is still one more `mul,udiv` pair that I couldn't get rid of. The root cause is the same issue as #121239, the `nuw` gets stripped off of `ptr::sub_ptr`. --- library/alloc/src/vec/in_place_collect.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/library/alloc/src/vec/in_place_collect.rs b/library/alloc/src/vec/in_place_collect.rs index 88aa1b1b0e081..5b0fde93b91c6 100644 --- a/library/alloc/src/vec/in_place_collect.rs +++ b/library/alloc/src/vec/in_place_collect.rs @@ -259,7 +259,8 @@ where inner.cap, inner.buf.cast::<T>(), inner.end as *const T, - inner.cap * mem::size_of::<I::Src>() / mem::size_of::<T>(), + // SAFETY: the multiplication can not overflow, since `inner.cap * size_of::<I::SRC>()` is the size of the allocation. + inner.cap.unchecked_mul(mem::size_of::<I::Src>()) / mem::size_of::<T>(), ) }; @@ -373,8 +374,10 @@ where // - unlike most internal iteration methods, it only takes a &mut self // - it lets us thread the write pointer through its innards and get it back in the end let sink = InPlaceDrop { inner: dst_buf, dst: dst_buf }; - let sink = - self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)).unwrap(); + let sink = match self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)) { + Ok(sink) => sink, + Err(never) => match never {}, + }; // iteration succeeded, don't drop head unsafe { ManuallyDrop::new(sink).dst.sub_ptr(dst_buf) } } From 6165dca6dbf66d45a373e08f16d1fc85f7be0ac7 Mon Sep 17 00:00:00 2001 From: Joshua Wong <joshuawong@anticentri.st> Date: Fri, 12 Apr 2024 18:36:31 -0500 Subject: [PATCH 2/4] optimize in_place_collect with vec::IntoIter::try_fold `Iterator::try_fold` gets called on the underlying Iterator in `SpecInPlaceCollect::collect_in_place` whenever it does not implement `TrustedRandomAccess`. For types that impl `Drop`, LLVM currently can't tell that the drop can never occur, when using the default `Iterator::try_fold` implementation. For example, the asm from the `unwrap_clone` method is currently: ``` unwrap_clone: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov rbx, rdi mov r12, qword ptr [rsi] mov rdi, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 16] movabs rsi, -6148914691236517205 mov r14, r12 test rax, rax je .LBB0_10 lea rcx, [rax + 2*rax] lea r14, [r12 + 8*rcx] shl rax, 3 lea rax, [rax + 2*rax] xor ecx, ecx .LBB0_2: cmp qword ptr [r12 + rcx], 0 je .LBB0_4 add rcx, 24 cmp rax, rcx jne .LBB0_2 jmp .LBB0_10 .LBB0_4: lea rdx, [rax - 24] lea r14, [r12 + rcx] cmp rdx, rcx je .LBB0_10 mov qword ptr [rsp], rdi sub rax, rcx add rax, -24 mul rsi mov r15, rdx lea rbp, [r12 + rcx] add rbp, 32 shr r15, 4 mov r13, qword ptr [rip + __rust_dealloc@GOTPCREL] jmp .LBB0_6 .LBB0_8: add rbp, 24 dec r15 je .LBB0_9 .LBB0_6: mov rsi, qword ptr [rbp] test rsi, rsi je .LBB0_8 mov rdi, qword ptr [rbp - 8] mov edx, 1 call r13 jmp .LBB0_8 .LBB0_9: mov rdi, qword ptr [rsp] movabs rsi, -6148914691236517205 .LBB0_10: sub r14, r12 mov rax, r14 mul rsi shr rdx, 4 mov qword ptr [rbx], r12 mov qword ptr [rbx + 8], rdi mov qword ptr [rbx + 16], rdx mov rax, rbx add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret ``` After this PR: ``` unwrap_clone: mov rax, rdi movups xmm0, xmmword ptr [rsi] mov rcx, qword ptr [rsi + 16] movups xmmword ptr [rdi], xmm0 mov qword ptr [rdi + 16], rcx ret ``` Fixes #120493 --- library/alloc/src/vec/into_iter.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/library/alloc/src/vec/into_iter.rs b/library/alloc/src/vec/into_iter.rs index b0226c848332c..280821434027d 100644 --- a/library/alloc/src/vec/into_iter.rs +++ b/library/alloc/src/vec/into_iter.rs @@ -289,6 +289,35 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { }; } + fn try_fold<B, F, R>(&mut self, init: B, mut f: F) -> R + where + Self: Sized, + F: FnMut(B, Self::Item) -> R, + R: core::ops::Try<Output = B>, + { + let mut accum = init; + if T::IS_ZST { + while self.ptr.as_ptr() != self.end.cast_mut() { + // SAFETY: we just checked that `self.ptr` is in bounds. + let tmp = unsafe { self.ptr.read() }; + // See `next` for why we subtract from `end` here. + self.end = self.end.wrapping_byte_sub(1); + accum = f(accum, tmp)?; + } + } else { + // SAFETY: `self.end` can only be null if `T` is a ZST. + while self.ptr != non_null!(self.end, T) { + // SAFETY: we just checked that `self.ptr` is in bounds. + let tmp = unsafe { self.ptr.read() }; + // SAFETY: the maximum this can be is `self.end`. + // Increment `self.ptr` first to avoid double dropping in the event of a panic. + self.ptr = unsafe { self.ptr.add(1) }; + accum = f(accum, tmp)?; + } + } + R::from_output(accum) + } + unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item where Self: TrustedRandomAccessNoCoerce, From 9d6b93c3e6609aa104da23cb9091c6c2e277f71e Mon Sep 17 00:00:00 2001 From: Joshua Wong <joshuawong@anticentri.st> Date: Sun, 12 May 2024 19:56:03 -0500 Subject: [PATCH 3/4] specialize `Iterator::fold` for `vec::IntoIter` LLVM currently adds a redundant check for the returned option, in addition to the `self.ptr != self.end` check when using the default `Iterator::fold` method that calls `vec::IntoIter::next` in a loop. --- library/alloc/src/vec/into_iter.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/library/alloc/src/vec/into_iter.rs b/library/alloc/src/vec/into_iter.rs index 280821434027d..c47989337708f 100644 --- a/library/alloc/src/vec/into_iter.rs +++ b/library/alloc/src/vec/into_iter.rs @@ -289,13 +289,38 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { }; } - fn try_fold<B, F, R>(&mut self, init: B, mut f: F) -> R + fn fold<B, F>(mut self, mut accum: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + if T::IS_ZST { + while self.ptr.as_ptr() != self.end.cast_mut() { + // SAFETY: we just checked that `self.ptr` is in bounds. + let tmp = unsafe { self.ptr.read() }; + // See `next` for why we subtract from `end` here. + self.end = self.end.wrapping_byte_sub(1); + accum = f(accum, tmp); + } + } else { + // SAFETY: `self.end` can only be null if `T` is a ZST. + while self.ptr != non_null!(self.end, T) { + // SAFETY: we just checked that `self.ptr` is in bounds. + let tmp = unsafe { self.ptr.read() }; + // SAFETY: the maximum this can be is `self.end`. + // Increment `self.ptr` first to avoid double dropping in the event of a panic. + self.ptr = unsafe { self.ptr.add(1) }; + accum = f(accum, tmp); + } + } + accum + } + + fn try_fold<B, F, R>(&mut self, mut accum: B, mut f: F) -> R where Self: Sized, F: FnMut(B, Self::Item) -> R, R: core::ops::Try<Output = B>, { - let mut accum = init; if T::IS_ZST { while self.ptr.as_ptr() != self.end.cast_mut() { // SAFETY: we just checked that `self.ptr` is in bounds. From 65e302fc3648c2268fecbff10a861f5bd3e0f67d Mon Sep 17 00:00:00 2001 From: Joshua Wong <joshuawong@anticentri.st> Date: Sat, 18 May 2024 19:15:21 -0500 Subject: [PATCH 4/4] use `Result::into_ok` on infallible result. --- library/alloc/src/lib.rs | 1 + library/alloc/src/vec/in_place_collect.rs | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 91b83cfe011f2..4ac0c9b15be7a 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -160,6 +160,7 @@ #![feature(tuple_trait)] #![feature(unicode_internals)] #![feature(unsize)] +#![feature(unwrap_infallible)] #![feature(vec_pop_if)] // tidy-alphabetical-end // diff --git a/library/alloc/src/vec/in_place_collect.rs b/library/alloc/src/vec/in_place_collect.rs index 5b0fde93b91c6..22541a2b9d82f 100644 --- a/library/alloc/src/vec/in_place_collect.rs +++ b/library/alloc/src/vec/in_place_collect.rs @@ -374,10 +374,8 @@ where // - unlike most internal iteration methods, it only takes a &mut self // - it lets us thread the write pointer through its innards and get it back in the end let sink = InPlaceDrop { inner: dst_buf, dst: dst_buf }; - let sink = match self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)) { - Ok(sink) => sink, - Err(never) => match never {}, - }; + let sink = + self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)).into_ok(); // iteration succeeded, don't drop head unsafe { ManuallyDrop::new(sink).dst.sub_ptr(dst_buf) } }