From c585541e671bc76d60163356c435bfb32ef858b4 Mon Sep 17 00:00:00 2001
From: Joshua Wong <joshuawong@anticentri.st>
Date: Fri, 12 Apr 2024 18:15:02 -0500
Subject: [PATCH 1/4] optimize in-place collection of `Vec`

LLVM does not know that the multiplication never overflows, which causes
it to generate unnecessary instructions. Use `usize::unchecked_mul` so
that LLVM can fold the `dst_cap` calculation when `size_of::<I::Src>() ==
size_of::<T>()`.
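
As a point of reference, the fold this enables can be seen in a
stand-alone sketch (hypothetical helper, not the library code;
`usize::unchecked_mul` was still gated behind `feature(unchecked_math)`
when this patch was written):

```rust
#![feature(unchecked_math)]

use std::mem::size_of;

// Hypothetical mirror of the library expression; assumes non-ZST types.
// SAFETY precondition: `cap * size_of::<Src>()` must not overflow, which
// holds in the library because it is the byte size of a live allocation.
pub unsafe fn dst_cap<Src, Dst>(cap: usize) -> usize {
    // A plain `*` lowers to a `mul` with no `nuw` flag, so LLVM cannot
    // cancel the division; `unchecked_mul` lowers to `mul nuw`, and when
    // `size_of::<Src>() == size_of::<Dst>()` the expression folds to `cap`.
    unsafe { cap.unchecked_mul(size_of::<Src>()) / size_of::<Dst>() }
}
```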

Running:

```
rustc -C llvm-args=-x86-asm-syntax=intel -O src/lib.rs --emit asm
```

on the following code:

```rust
pub struct Foo([usize; 3]);

pub fn unwrap_copy(v: Vec<Foo>) -> Vec<[usize; 3]> {
    v.into_iter().map(|f| f.0).collect()
}
```

Before this commit:

```
define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) {
start:
  %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8
  %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8
  %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8
  %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16
  %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8
  %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24
  %0 = udiv i64 %_19.i.idx, 24
  %_16.i.i = mul i64 %me.sroa.0.0.copyload.i, 24
  %dst_cap.i.i = udiv i64 %_16.i.i, 24
  store i64 %dst_cap.i.i, ptr %_0, align 8
  %1 = getelementptr inbounds i8, ptr %_0, i64 8
  store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8
  %2 = getelementptr inbounds i8, ptr %_0, i64 16
  store i64 %0, ptr %2, align 8
  ret void
}
```

After:

```
define void @unwrap_copy(ptr noalias nocapture noundef writeonly sret([24 x i8]) align 8 dereferenceable(24) %_0, ptr noalias nocapture noundef readonly align 8 dereferenceable(24) %iter) {
start:
  %me.sroa.0.0.copyload.i = load i64, ptr %iter, align 8
  %me.sroa.4.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 8
  %me.sroa.4.0.copyload.i = load ptr, ptr %me.sroa.4.0.self.sroa_idx.i, align 8
  %me.sroa.5.0.self.sroa_idx.i = getelementptr inbounds i8, ptr %iter, i64 16
  %me.sroa.5.0.copyload.i = load i64, ptr %me.sroa.5.0.self.sroa_idx.i, align 8
  %_19.i.idx = mul nsw i64 %me.sroa.5.0.copyload.i, 24
  %0 = udiv i64 %_19.i.idx, 24
  store i64 %me.sroa.0.0.copyload.i, ptr %_0, align 8
  %1 = getelementptr inbounds i8, ptr %_0, i64 8
  store ptr %me.sroa.4.0.copyload.i, ptr %1, align 8
  %2 = getelementptr inbounds i8, ptr %_0, i64 16
  store i64 %0, ptr %2, align 8, !alias.scope !9, !noalias !14
  ret void
}
```

Note that there is still one more `mul`/`udiv` pair that I couldn't get
rid of. The root cause is the same issue as #121239: the `nuw` flag gets
stripped off of the `ptr::sub_ptr` computation.
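
For reference, the division comes from `ptr::sub_ptr` (a minimal sketch;
hypothetical function, using the then-unstable `feature(ptr_sub_ptr)`):

```rust
#![feature(ptr_sub_ptr)]

// Hypothetical illustration: `sub_ptr` computes the byte distance and
// divides it by `size_of::<T>()`; when the multiplication that produced
// that distance loses its `nuw` flag, LLVM cannot cancel the division
// against it, leaving the `mul`/`udiv` pair seen above.
//
// SAFETY precondition: `end` must be derived from `start`, point no
// earlier than it, and lie a whole number of `T`s away.
pub unsafe fn elem_count<T>(start: *const T, end: *const T) -> usize {
    unsafe { end.sub_ptr(start) }
}
```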
---
 library/alloc/src/vec/in_place_collect.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/library/alloc/src/vec/in_place_collect.rs b/library/alloc/src/vec/in_place_collect.rs
index 88aa1b1b0e081..5b0fde93b91c6 100644
--- a/library/alloc/src/vec/in_place_collect.rs
+++ b/library/alloc/src/vec/in_place_collect.rs
@@ -259,7 +259,8 @@ where
             inner.cap,
             inner.buf.cast::<T>(),
             inner.end as *const T,
-            inner.cap * mem::size_of::<I::Src>() / mem::size_of::<T>(),
+            // SAFETY: the multiplication cannot overflow, since `inner.cap * size_of::<I::Src>()` is the size of the allocation.
+            inner.cap.unchecked_mul(mem::size_of::<I::Src>()) / mem::size_of::<T>(),
         )
     };
 
@@ -373,8 +374,10 @@ where
         // - unlike most internal iteration methods, it only takes a &mut self
         // - it lets us thread the write pointer through its innards and get it back in the end
         let sink = InPlaceDrop { inner: dst_buf, dst: dst_buf };
-        let sink =
-            self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)).unwrap();
+        let sink = match self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)) {
+            Ok(sink) => sink,
+            Err(never) => match never {},
+        };
         // iteration succeeded, don't drop head
         unsafe { ManuallyDrop::new(sink).dst.sub_ptr(dst_buf) }
     }

From 6165dca6dbf66d45a373e08f16d1fc85f7be0ac7 Mon Sep 17 00:00:00 2001
From: Joshua Wong <joshuawong@anticentri.st>
Date: Fri, 12 Apr 2024 18:36:31 -0500
Subject: [PATCH 2/4] optimize in_place_collect with vec::IntoIter::try_fold

`Iterator::try_fold` gets called on the underlying iterator in
`SpecInPlaceCollect::collect_in_place` whenever the iterator does not
implement `TrustedRandomAccess`. For element types that impl `Drop`,
LLVM currently can't tell that the drop can never occur when the default
`Iterator::try_fold` implementation is used.
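
The source of `unwrap_clone` is not included in this patch; a
hypothetical stand-in with the same shape (an in-place `map` + `collect`
over droppable 24-byte elements) would be:

```rust
// Hypothetical stand-in, not the original `unwrap_clone` reproducer:
// `String::into_bytes` is a plain conversion, yet with the default
// `try_fold` the unreachable drop of the partially-collected buffer
// survives into the generated asm.
pub fn into_bytes_all(v: Vec<String>) -> Vec<Vec<u8>> {
    v.into_iter().map(String::into_bytes).collect()
}
```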

For example, the asm for the `unwrap_clone` function is currently:

```
unwrap_clone:
        push    rbp
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        push    rax
        mov     rbx, rdi
        mov     r12, qword ptr [rsi]
        mov     rdi, qword ptr [rsi + 8]
        mov     rax, qword ptr [rsi + 16]
        movabs  rsi, -6148914691236517205
        mov     r14, r12
        test    rax, rax
        je      .LBB0_10
        lea     rcx, [rax + 2*rax]
        lea     r14, [r12 + 8*rcx]
        shl     rax, 3
        lea     rax, [rax + 2*rax]
        xor     ecx, ecx
.LBB0_2:
        cmp     qword ptr [r12 + rcx], 0
        je      .LBB0_4
        add     rcx, 24
        cmp     rax, rcx
        jne     .LBB0_2
        jmp     .LBB0_10
.LBB0_4:
        lea     rdx, [rax - 24]
        lea     r14, [r12 + rcx]
        cmp     rdx, rcx
        je      .LBB0_10
        mov     qword ptr [rsp], rdi
        sub     rax, rcx
        add     rax, -24
        mul     rsi
        mov     r15, rdx
        lea     rbp, [r12 + rcx]
        add     rbp, 32
        shr     r15, 4
        mov     r13, qword ptr [rip + __rust_dealloc@GOTPCREL]
        jmp     .LBB0_6
.LBB0_8:
        add     rbp, 24
        dec     r15
        je      .LBB0_9
.LBB0_6:
        mov     rsi, qword ptr [rbp]
        test    rsi, rsi
        je      .LBB0_8
        mov     rdi, qword ptr [rbp - 8]
        mov     edx, 1
        call    r13
        jmp     .LBB0_8
.LBB0_9:
        mov     rdi, qword ptr [rsp]
        movabs  rsi, -6148914691236517205
.LBB0_10:
        sub     r14, r12
        mov     rax, r14
        mul     rsi
        shr     rdx, 4
        mov     qword ptr [rbx], r12
        mov     qword ptr [rbx + 8], rdi
        mov     qword ptr [rbx + 16], rdx
        mov     rax, rbx
        add     rsp, 8
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
```

After this PR:

```
unwrap_clone:
	mov	rax, rdi
	movups	xmm0, xmmword ptr [rsi]
	mov	rcx, qword ptr [rsi + 16]
	movups	xmmword ptr [rdi], xmm0
	mov	qword ptr [rdi + 16], rcx
	ret
```

Fixes #120493
---
 library/alloc/src/vec/into_iter.rs | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/library/alloc/src/vec/into_iter.rs b/library/alloc/src/vec/into_iter.rs
index b0226c848332c..280821434027d 100644
--- a/library/alloc/src/vec/into_iter.rs
+++ b/library/alloc/src/vec/into_iter.rs
@@ -289,6 +289,35 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> {
         };
     }
 
+    fn try_fold<B, F, R>(&mut self, init: B, mut f: F) -> R
+    where
+        Self: Sized,
+        F: FnMut(B, Self::Item) -> R,
+        R: core::ops::Try<Output = B>,
+    {
+        let mut accum = init;
+        if T::IS_ZST {
+            while self.ptr.as_ptr() != self.end.cast_mut() {
+                // SAFETY: we just checked that `self.ptr` is in bounds.
+                let tmp = unsafe { self.ptr.read() };
+                // See `next` for why we subtract from `end` here.
+                self.end = self.end.wrapping_byte_sub(1);
+                accum = f(accum, tmp)?;
+            }
+        } else {
+            // SAFETY: `self.end` can only be null if `T` is a ZST.
+            while self.ptr != non_null!(self.end, T) {
+                // SAFETY: we just checked that `self.ptr` is in bounds.
+                let tmp = unsafe { self.ptr.read() };
+                // SAFETY: the maximum this can be is `self.end`.
+                // Increment `self.ptr` first to avoid double dropping in the event of a panic.
+                self.ptr = unsafe { self.ptr.add(1) };
+                accum = f(accum, tmp)?;
+            }
+        }
+        R::from_output(accum)
+    }
+
     unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item
     where
         Self: TrustedRandomAccessNoCoerce,

From 9d6b93c3e6609aa104da23cb9091c6c2e277f71e Mon Sep 17 00:00:00 2001
From: Joshua Wong <joshuawong@anticentri.st>
Date: Sun, 12 May 2024 19:56:03 -0500
Subject: [PATCH 3/4] specialize `Iterator::fold` for `vec::IntoIter`

LLVM currently adds a redundant check on the `Option` returned by
`next`, in addition to the `self.ptr != self.end` check, when the
default `Iterator::fold` implementation is used, since it calls
`vec::IntoIter::next` in a loop.
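
A minimal reproducer for this shape (hypothetical, not taken from an
issue):

```rust
// With the default `fold`, the loop re-tests the `Option` returned by
// `next` even though `next` only yields `None` once `ptr == end`.
pub fn sum(v: Vec<u64>) -> u64 {
    v.into_iter().fold(0, |acc, x| acc + x)
}
```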
---
 library/alloc/src/vec/into_iter.rs | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/library/alloc/src/vec/into_iter.rs b/library/alloc/src/vec/into_iter.rs
index 280821434027d..c47989337708f 100644
--- a/library/alloc/src/vec/into_iter.rs
+++ b/library/alloc/src/vec/into_iter.rs
@@ -289,13 +289,38 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> {
         };
     }
 
-    fn try_fold<B, F, R>(&mut self, init: B, mut f: F) -> R
+    fn fold<B, F>(mut self, mut accum: B, mut f: F) -> B
+    where
+        F: FnMut(B, Self::Item) -> B,
+    {
+        if T::IS_ZST {
+            while self.ptr.as_ptr() != self.end.cast_mut() {
+                // SAFETY: we just checked that `self.ptr` is in bounds.
+                let tmp = unsafe { self.ptr.read() };
+                // See `next` for why we subtract from `end` here.
+                self.end = self.end.wrapping_byte_sub(1);
+                accum = f(accum, tmp);
+            }
+        } else {
+            // SAFETY: `self.end` can only be null if `T` is a ZST.
+            while self.ptr != non_null!(self.end, T) {
+                // SAFETY: we just checked that `self.ptr` is in bounds.
+                let tmp = unsafe { self.ptr.read() };
+                // SAFETY: the maximum this can be is `self.end`.
+                // Increment `self.ptr` first to avoid double dropping in the event of a panic.
+                self.ptr = unsafe { self.ptr.add(1) };
+                accum = f(accum, tmp);
+            }
+        }
+        accum
+    }
+
+    fn try_fold<B, F, R>(&mut self, mut accum: B, mut f: F) -> R
     where
         Self: Sized,
         F: FnMut(B, Self::Item) -> R,
         R: core::ops::Try<Output = B>,
     {
-        let mut accum = init;
         if T::IS_ZST {
             while self.ptr.as_ptr() != self.end.cast_mut() {
                 // SAFETY: we just checked that `self.ptr` is in bounds.

From 65e302fc3648c2268fecbff10a861f5bd3e0f67d Mon Sep 17 00:00:00 2001
From: Joshua Wong <joshuawong@anticentri.st>
Date: Sat, 18 May 2024 19:15:21 -0500
Subject: [PATCH 4/4] use `Result::into_ok` on infallible result
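
For reference, a minimal nightly sketch of `Result::into_ok`
(hypothetical example; requires `feature(unwrap_infallible)` and
`feature(never_type)`):

```rust
#![feature(unwrap_infallible)]
#![feature(never_type)]

// `into_ok` consumes a `Result` whose error type can never be
// constructed, so no visible branch is needed at the call site.
fn always_ok() -> Result<u32, !> {
    Ok(7)
}

fn main() {
    let n: u32 = always_ok().into_ok();
    assert_eq!(n, 7);
}
```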

---
 library/alloc/src/lib.rs                  | 1 +
 library/alloc/src/vec/in_place_collect.rs | 6 ++----
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
index 91b83cfe011f2..4ac0c9b15be7a 100644
--- a/library/alloc/src/lib.rs
+++ b/library/alloc/src/lib.rs
@@ -160,6 +160,7 @@
 #![feature(tuple_trait)]
 #![feature(unicode_internals)]
 #![feature(unsize)]
+#![feature(unwrap_infallible)]
 #![feature(vec_pop_if)]
 // tidy-alphabetical-end
 //
diff --git a/library/alloc/src/vec/in_place_collect.rs b/library/alloc/src/vec/in_place_collect.rs
index 5b0fde93b91c6..22541a2b9d82f 100644
--- a/library/alloc/src/vec/in_place_collect.rs
+++ b/library/alloc/src/vec/in_place_collect.rs
@@ -374,10 +374,8 @@ where
         // - unlike most internal iteration methods, it only takes a &mut self
         // - it lets us thread the write pointer through its innards and get it back in the end
         let sink = InPlaceDrop { inner: dst_buf, dst: dst_buf };
-        let sink = match self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)) {
-            Ok(sink) => sink,
-            Err(never) => match never {},
-        };
+        let sink =
+            self.try_fold::<_, _, Result<_, !>>(sink, write_in_place_with_drop(end)).into_ok();
         // iteration succeeded, don't drop head
         unsafe { ManuallyDrop::new(sink).dst.sub_ptr(dst_buf) }
     }