diff --git a/compiler/rustc_codegen_llvm/src/abi.rs b/compiler/rustc_codegen_llvm/src/abi.rs
index e5f5146fac8fb..d2828669d438f 100644
--- a/compiler/rustc_codegen_llvm/src/abi.rs
+++ b/compiler/rustc_codegen_llvm/src/abi.rs
@@ -16,13 +16,15 @@ pub use rustc_middle::ty::layout::{FAT_PTR_ADDR, FAT_PTR_EXTRA};
 use rustc_middle::ty::Ty;
 use rustc_session::config;
 pub use rustc_target::abi::call::*;
-use rustc_target::abi::{self, HasDataLayout, Int};
+use rustc_target::abi::{self, HasDataLayout, Int, Size};
 pub use rustc_target::spec::abi::Abi;
 use rustc_target::spec::SanitizerSet;
 
 use libc::c_uint;
 use smallvec::SmallVec;
 
+use std::cmp;
+
 pub trait ArgAttributesExt {
     fn apply_attrs_to_llfn(&self, idx: AttributePlace, cx: &CodegenCx<'_, '_>, llfn: &Value);
     fn apply_attrs_to_callsite(
@@ -130,42 +132,36 @@ impl LlvmType for Reg {
 impl LlvmType for CastTarget {
     fn llvm_type<'ll>(&self, cx: &CodegenCx<'ll, '_>) -> &'ll Type {
         let rest_ll_unit = self.rest.unit.llvm_type(cx);
-        let (rest_count, rem_bytes) = if self.rest.unit.size.bytes() == 0 {
-            (0, 0)
+        let rest_count = if self.rest.total == Size::ZERO {
+            0
         } else {
-            (
-                self.rest.total.bytes() / self.rest.unit.size.bytes(),
-                self.rest.total.bytes() % self.rest.unit.size.bytes(),
-            )
+            assert_ne!(
+                self.rest.unit.size,
+                Size::ZERO,
+                "total size {:?} cannot be divided into units of zero size",
+                self.rest.total
+            );
+            if self.rest.total.bytes() % self.rest.unit.size.bytes() != 0 {
+                assert_eq!(self.rest.unit.kind, RegKind::Integer, "only int regs can be split");
+            }
+            self.rest.total.bytes().div_ceil(self.rest.unit.size.bytes())
         };
 
+        // Simplify to a single unit or an array if there's no prefix.
+        // This produces the same layout, but using a simpler type.
         if self.prefix.iter().all(|x| x.is_none()) {
-            // Simplify to a single unit when there is no prefix and size <= unit size
-            if self.rest.total <= self.rest.unit.size {
+            if rest_count == 1 {
                 return rest_ll_unit;
             }
 
-            // Simplify to array when all chunks are the same size and type
-            if rem_bytes == 0 {
-                return cx.type_array(rest_ll_unit, rest_count);
-            }
-        }
-
-        // Create list of fields in the main structure
-        let mut args: Vec<_> = self
-            .prefix
-            .iter()
-            .flat_map(|option_reg| option_reg.map(|reg| reg.llvm_type(cx)))
-            .chain((0..rest_count).map(|_| rest_ll_unit))
-            .collect();
-
-        // Append final integer
-        if rem_bytes != 0 {
-            // Only integers can be really split further.
-            assert_eq!(self.rest.unit.kind, RegKind::Integer);
-            args.push(cx.type_ix(rem_bytes * 8));
+            return cx.type_array(rest_ll_unit, rest_count);
         }
 
+        // Generate a struct type with the prefix and the "rest" arguments.
+        let prefix_args =
+            self.prefix.iter().flat_map(|option_reg| option_reg.map(|reg| reg.llvm_type(cx)));
+        let rest_args = (0..rest_count).map(|_| rest_ll_unit);
+        let args: Vec<_> = prefix_args.chain(rest_args).collect();
         cx.type_struct(&args, false)
     }
 }
@@ -215,47 +211,33 @@ impl<'ll, 'tcx> ArgAbiExt<'ll, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
                 bug!("unsized `ArgAbi` must be handled through `store_fn_arg`");
             }
             PassMode::Cast { cast, pad_i32: _ } => {
-                // FIXME(eddyb): Figure out when the simpler Store is safe, clang
-                // uses it for i16 -> {i8, i8}, but not for i24 -> {i8, i8, i8}.
-                let can_store_through_cast_ptr = false;
-                if can_store_through_cast_ptr {
-                    bx.store(val, dst.llval, self.layout.align.abi);
-                } else {
-                    // The actual return type is a struct, but the ABI
-                    // adaptation code has cast it into some scalar type. The
-                    // code that follows is the only reliable way I have
-                    // found to do a transform like i64 -> {i32,i32}.
-                    // Basically we dump the data onto the stack then memcpy it.
-                    //
-                    // Other approaches I tried:
-                    // - Casting rust ret pointer to the foreign type and using Store
-                    //   is (a) unsafe if size of foreign type > size of rust type and
-                    //   (b) runs afoul of strict aliasing rules, yielding invalid
-                    //   assembly under -O (specifically, the store gets removed).
-                    // - Truncating foreign type to correct integral type and then
-                    //   bitcasting to the struct type yields invalid cast errors.
-
-                    // We instead thus allocate some scratch space...
-                    let scratch_size = cast.size(bx);
-                    let scratch_align = cast.align(bx);
-                    let llscratch = bx.alloca(cast.llvm_type(bx), scratch_align);
-                    bx.lifetime_start(llscratch, scratch_size);
-
-                    // ... where we first store the value...
-                    bx.store(val, llscratch, scratch_align);
-
-                    // ... and then memcpy it to the intended destination.
-                    bx.memcpy(
-                        dst.llval,
-                        self.layout.align.abi,
-                        llscratch,
-                        scratch_align,
-                        bx.const_usize(self.layout.size.bytes()),
-                        MemFlags::empty(),
-                    );
-
-                    bx.lifetime_end(llscratch, scratch_size);
-                }
+                // The ABI mandates that the value is passed as a different struct representation.
+                // Spill and reload it from the stack to convert from the ABI representation to
+                // the Rust representation.
+                let scratch_size = cast.size(bx);
+                let scratch_align = cast.align(bx);
+                // Note that the ABI type may be either larger or smaller than the Rust type,
+                // due to the presence or absence of trailing padding. For example:
+                // - On some ABIs, the Rust layout { f64, f32, <f32 padding> } may omit padding
+                //   when passed by value, making it smaller.
+                // - On some ABIs, the Rust layout { u16, u16, u16 } may be padded up to 8 bytes
+                //   when passed by value, making it larger.
+                let copy_bytes = cmp::min(scratch_size.bytes(), self.layout.size.bytes());
+                // Allocate some scratch space...
+                let llscratch = bx.alloca(cast.llvm_type(bx), scratch_align);
+                bx.lifetime_start(llscratch, scratch_size);
+                // ...store the value...
+                bx.store(val, llscratch, scratch_align);
+                // ... and then memcpy it to the intended destination.
+                bx.memcpy(
+                    dst.llval,
+                    self.layout.align.abi,
+                    llscratch,
+                    scratch_align,
+                    bx.const_usize(copy_bytes),
+                    MemFlags::empty(),
+                );
+                bx.lifetime_end(llscratch, scratch_size);
             }
             _ => {
                 OperandRef::from_immediate_or_packed_pair(bx, val, self.layout).val.store(bx, dst);
diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs
index 9bb2a52826585..333c03ce82377 100644
--- a/compiler/rustc_codegen_ssa/src/mir/block.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/block.rs
@@ -1471,9 +1471,35 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
 
         if by_ref && !arg.is_indirect() {
             // Have to load the argument, maybe while casting it.
-            if let PassMode::Cast { cast: ty, .. } = &arg.mode {
-                let llty = bx.cast_backend_type(ty);
-                llval = bx.load(llty, llval, align.min(arg.layout.align.abi));
+            if let PassMode::Cast { cast, pad_i32: _ } = &arg.mode {
+                // The ABI mandates that the value is passed as a different struct representation.
+                // Spill and reload it from the stack to convert from the Rust representation to
+                // the ABI representation.
+                let scratch_size = cast.size(bx);
+                let scratch_align = cast.align(bx);
+                // Note that the ABI type may be either larger or smaller than the Rust type,
+                // due to the presence or absence of trailing padding. For example:
+                // - On some ABIs, the Rust layout { f64, f32, <f32 padding> } may omit padding
+                //   when passed by value, making it smaller.
+                // - On some ABIs, the Rust layout { u16, u16, u16 } may be padded up to 8 bytes
+                //   when passed by value, making it larger.
+                let copy_bytes = cmp::min(scratch_size.bytes(), arg.layout.size.bytes());
+                // Allocate some scratch space...
+                let llscratch = bx.alloca(bx.cast_backend_type(cast), scratch_align);
+                bx.lifetime_start(llscratch, scratch_size);
+                // ...memcpy the value...
+                bx.memcpy(
+                    llscratch,
+                    scratch_align,
+                    llval,
+                    align,
+                    bx.const_usize(copy_bytes),
+                    MemFlags::empty(),
+                );
+                // ...and then load it with the ABI type.
+                let cast_ty = bx.cast_backend_type(cast);
+                llval = bx.load(cast_ty, llscratch, scratch_align);
+                bx.lifetime_end(llscratch, scratch_size);
             } else {
                 // We can't use `PlaceRef::load` here because the argument
                 // may have a type we don't treat as immediate, but the ABI
diff --git a/compiler/rustc_target/src/abi/call/mod.rs b/compiler/rustc_target/src/abi/call/mod.rs
index 486afc5f8f30a..643ba9f1cddcc 100644
--- a/compiler/rustc_target/src/abi/call/mod.rs
+++ b/compiler/rustc_target/src/abi/call/mod.rs
@@ -251,9 +251,9 @@ pub struct Uniform {
     /// The total size of the argument, which can be:
     /// * equal to `unit.size` (one scalar/vector),
     /// * a multiple of `unit.size` (an array of scalar/vectors),
-    /// * if `unit.kind` is `Integer`, the last element
-    ///   can be shorter, i.e., `{ i64, i64, i32 }` for
-    ///   64-bit integers with a total size of 20 bytes.
+    /// * if `unit.kind` is `Integer`, the last element can be shorter, i.e., `{ i64, i64, i32 }`
+    ///   for 64-bit integers with a total size of 20 bytes. When the argument is actually passed,
+    ///   this size will be rounded up to the nearest multiple of `unit.size`.
     pub total: Size,
 }
 
@@ -319,14 +319,17 @@ impl CastTarget {
     }
 
     pub fn size<C: HasDataLayout>(&self, _cx: &C) -> Size {
-        let mut size = self.rest.total;
-        for i in 0..self.prefix.iter().count() {
-            match self.prefix[i] {
-                Some(v) => size += v.size,
-                None => {}
-            }
-        }
-        return size;
+        // Prefix arguments are passed in specific designated registers
+        let prefix_size = self
+            .prefix
+            .iter()
+            .filter_map(|x| x.map(|reg| reg.size))
+            .fold(Size::ZERO, |acc, size| acc + size);
+        // Remaining arguments are passed in chunks of the unit size
+        let rest_size =
+            self.rest.unit.size * self.rest.total.bytes().div_ceil(self.rest.unit.size.bytes());
+
+        prefix_size + rest_size
     }
 
     pub fn align<C: HasDataLayout>(&self, cx: &C) -> Align {
diff --git a/tests/auxiliary/rust_test_helpers.c b/tests/auxiliary/rust_test_helpers.c
index 977ea487a9804..965df44c67608 100644
--- a/tests/auxiliary/rust_test_helpers.c
+++ b/tests/auxiliary/rust_test_helpers.c
@@ -118,6 +118,30 @@ rust_dbg_extern_identity_TwoDoubles(struct TwoDoubles u) {
     return u;
 }
 
+struct FiveU16s {
+    uint16_t one;
+    uint16_t two;
+    uint16_t three;
+    uint16_t four;
+    uint16_t five;
+};
+
+struct FiveU16s
+rust_dbg_extern_return_FiveU16s() {
+    struct FiveU16s s;
+    s.one = 10;
+    s.two = 20;
+    s.three = 30;
+    s.four = 40;
+    s.five = 50;
+    return s;
+}
+
+struct FiveU16s
+rust_dbg_extern_identity_FiveU16s(struct FiveU16s u) {
+    return u;
+}
+
 struct ManyInts {
     int8_t arg1;
     int16_t arg2;
diff --git a/tests/codegen/cast-target-abi.rs b/tests/codegen/cast-target-abi.rs
new file mode 100644
index 0000000000000..e6024f03425f4
--- /dev/null
+++ b/tests/codegen/cast-target-abi.rs
@@ -0,0 +1,280 @@
+// ignore-tidy-linelength
+//@ revisions:aarch64 loongarch64 powerpc64 sparc64
+//@ compile-flags: -O -C no-prepopulate-passes
+
+//@[aarch64] compile-flags: --target aarch64-unknown-linux-gnu
+//@[aarch64] needs-llvm-components: arm
+//@[loongarch64] compile-flags: --target loongarch64-unknown-linux-gnu
+//@[loongarch64] needs-llvm-components: loongarch
+//@[powerpc64] compile-flags: --target powerpc64-unknown-linux-gnu
+//@[powerpc64] needs-llvm-components: powerpc
+//@[sparc64] compile-flags: --target sparc64-unknown-linux-gnu
+//@[sparc64] needs-llvm-components: sparc
+
+// Tests that arguments with `PassMode::Cast` are handled correctly.
+
+#![feature(no_core, lang_items)]
+#![crate_type = "lib"]
+#![no_std]
+#![no_core]
+
+#[lang="sized"] trait Sized { }
+#[lang="freeze"] trait Freeze { }
+#[lang="copy"] trait Copy { }
+
+// This struct will be passed as a single `i64` or `i32`.
+// This may be (if `i64)) larger than the Rust layout, which is just `{ i16, i16 }`.
+#[repr(C)]
+pub struct TwoU16s {
+    a: u16,
+    b: u16,
+}
+
+// This struct will be passed as `[2 x i64]`.
+// This is larger than the Rust layout.
+#[repr(C)]
+pub struct FiveU16s {
+    a: u16,
+    b: u16,
+    c: u16,
+    d: u16,
+    e: u16,
+}
+
+// This struct will be passed as `[2 x double]`.
+// This is the same as the Rust layout.
+#[repr(C)]
+pub struct DoubleDouble {
+    f: f64,
+    g: f64,
+}
+
+// On loongarch, this struct will be passed as `{ double, float }`.
+// This is smaller than the Rust layout, which has trailing padding (`{ f64, f32, <f32 padding> }`)
+#[repr(C)]
+pub struct DoubleFloat {
+    f: f64,
+    g: f32,
+}
+
+extern "C" {
+    fn receives_twou16s(x: TwoU16s);
+    fn returns_twou16s() -> TwoU16s;
+
+    fn receives_fiveu16s(x: FiveU16s);
+    fn returns_fiveu16s() -> FiveU16s;
+
+    fn receives_doubledouble(x: DoubleDouble);
+    fn returns_doubledouble() -> DoubleDouble;
+
+    // These functions cause an ICE in sparc64 ABI code (https://github.com/rust-lang/rust/issues/122620)
+    #[cfg(not(target_arch = "sparc64"))]
+    fn receives_doublefloat(x: DoubleFloat);
+    #[cfg(not(target_arch = "sparc64"))]
+    fn returns_doublefloat() -> DoubleFloat;
+}
+
+// CHECK-LABEL: @call_twou16s
+#[no_mangle]
+pub unsafe fn call_twou16s() {
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i64]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i64]], align [[ABI_ALIGN:8]]
+    // powerpc64:   [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i32]], align [[ABI_ALIGN:4]]
+    // sparc64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i64]], align [[ABI_ALIGN:8]]
+
+    // CHECK: [[RUST_ALLOCA:%.+]] = alloca %TwoU16s, align [[RUST_ALIGN:2]]
+
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], i64 4, i1 false)
+    // CHECK: [[ABI_VALUE:%.+]] = load [[ABI_TYPE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // CHECK: call void @receives_twou16s([[ABI_TYPE]] [[ABI_VALUE]])
+    let x = TwoU16s { a: 1, b: 2 };
+    receives_twou16s(x);
+}
+
+// CHECK-LABEL: @return_twou16s
+#[no_mangle]
+pub unsafe fn return_twou16s() -> TwoU16s {
+    // powerpc returns this struct via sret pointer, it doesn't use the cast ABI.
+
+    // powerpc64: [[RETVAL:%.+]] = alloca %TwoU16s, align 2
+    // powerpc64: call void @returns_twou16s(ptr {{.+}} [[RETVAL]])
+
+
+    // The other targets copy the cast ABI type to an alloca.
+
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i64]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i64]], align [[ABI_ALIGN:8]]
+    // sparc64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:i64]], align [[ABI_ALIGN:8]]
+
+    // aarch64:     [[RUST_ALLOCA:%.+]] = alloca %TwoU16s, align [[RUST_ALIGN:2]]
+    // loongarch64: [[RUST_ALLOCA:%.+]] = alloca %TwoU16s, align [[RUST_ALIGN:2]]
+    // sparc64:     [[RUST_ALLOCA:%.+]] = alloca %TwoU16s, align [[RUST_ALIGN:2]]
+
+    // aarch64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_twou16s()
+    // loongarch64: [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_twou16s()
+    // sparc64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_twou16s()
+
+    // aarch64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // loongarch64: store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // sparc64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+
+    // aarch64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 4, i1 false)
+    // loongarch64: call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 4, i1 false)
+    // sparc64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 4, i1 false)
+    returns_twou16s()
+}
+
+// CHECK-LABEL: @call_fiveu16s
+#[no_mangle]
+pub unsafe fn call_fiveu16s() {
+    // CHECK: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+
+    // CHECK: [[RUST_ALLOCA:%.+]] = alloca %FiveU16s, align 2
+
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], i64 10, i1 false)
+    // CHECK: [[ABI_VALUE:%.+]] = load [[ABI_TYPE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // CHECK: call void @receives_fiveu16s([[ABI_TYPE]] [[ABI_VALUE]])
+    let x = FiveU16s { a: 1, b: 2, c: 3, d: 4, e: 5 };
+    receives_fiveu16s(x);
+}
+
+// CHECK-LABEL: @return_fiveu16s
+// CHECK-SAME: (ptr {{.+}} sret([10 x i8]) align [[RUST_ALIGN:2]] dereferenceable(10) [[RET_PTR:%.+]])
+#[no_mangle]
+pub unsafe fn return_fiveu16s() -> FiveU16s {
+    // powerpc returns this struct via sret pointer, it doesn't use the cast ABI.
+
+    // powerpc64: call void @returns_fiveu16s(ptr {{.+}} [[RET_PTR]])
+
+
+    // The other targets copy the cast ABI type to the sret pointer.
+
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+    // sparc64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+
+    // aarch64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_fiveu16s()
+    // loongarch64: [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_fiveu16s()
+    // sparc64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_fiveu16s()
+
+    // aarch64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // loongarch64: store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // sparc64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+
+    // aarch64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RET_PTR]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 10, i1 false)
+    // loongarch64: call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RET_PTR]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 10, i1 false)
+    // sparc64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RET_PTR]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 10, i1 false)
+    returns_fiveu16s()
+}
+
+// CHECK-LABEL: @call_doubledouble
+#[no_mangle]
+pub unsafe fn call_doubledouble() {
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x double\]]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:{ double, double }]], align [[ABI_ALIGN:8]]
+    // powerpc64:   [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+    // sparc64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:{ double, double }]], align [[ABI_ALIGN:8]]
+
+    // CHECK: [[RUST_ALLOCA:%.+]] = alloca %DoubleDouble, align [[RUST_ALIGN:8]]
+
+    // CHECK: call void @llvm.memcpy.{{.+}}(ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], i64 16, i1 false)
+    // CHECK: [[ABI_VALUE:%.+]] = load [[ABI_TYPE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // CHECK: call void @receives_doubledouble([[ABI_TYPE]] [[ABI_VALUE]])
+    let x = DoubleDouble { f: 1., g: 2. };
+    receives_doubledouble(x);
+}
+
+// CHECK-LABEL: @return_doubledouble
+#[no_mangle]
+pub unsafe fn return_doubledouble() -> DoubleDouble {
+    // powerpc returns this struct via sret pointer, it doesn't use the cast ABI.
+
+    // powerpc64: [[RETVAL:%.+]] = alloca %DoubleDouble, align 8
+    // powerpc64: call void @returns_doubledouble(ptr {{.+}} [[RETVAL]])
+
+
+    // The other targets copy the cast ABI type to an alloca.
+
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x double\]]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:{ double, double }]], align [[ABI_ALIGN:8]]
+    // sparc64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:{ double, double }]], align [[ABI_ALIGN:8]]
+
+    // aarch64:     [[RUST_ALLOCA:%.+]] = alloca %DoubleDouble, align [[RUST_ALIGN:8]]
+    // loongarch64: [[RUST_ALLOCA:%.+]] = alloca %DoubleDouble, align [[RUST_ALIGN:8]]
+    // sparc64:     [[RUST_ALLOCA:%.+]] = alloca %DoubleDouble, align [[RUST_ALIGN:8]]
+
+    // aarch64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_doubledouble()
+    // loongarch64: [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_doubledouble()
+    // sparc64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_doubledouble()
+
+    // aarch64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // loongarch64: store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // sparc64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+
+    // aarch64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 16, i1 false)
+    // loongarch64: call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 16, i1 false)
+    // sparc64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 16, i1 false)
+    returns_doubledouble()
+}
+
+// This test causes an ICE in sparc64 ABI code (https://github.com/rust-lang/rust/issues/122620)
+#[cfg(not(target_arch = "sparc64"))]
+// aarch64-LABEL:     @call_doublefloat
+// loongarch64-LABEL: @call_doublefloat
+// powerpc64-LABEL:   @call_doublefloat
+#[no_mangle]
+pub unsafe fn call_doublefloat() {
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:{ double, float }]], align [[ABI_ALIGN:8]]
+    // powerpc64:   [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+
+    // aarch64:     [[RUST_ALLOCA:%.+]] = alloca %DoubleFloat, align [[RUST_ALIGN:8]]
+    // loongarch64: [[RUST_ALLOCA:%.+]] = alloca %DoubleFloat, align [[RUST_ALIGN:8]]
+    // powerpc64:   [[RUST_ALLOCA:%.+]] = alloca %DoubleFloat, align [[RUST_ALIGN:8]]
+
+    // aarch64:     call void @llvm.memcpy.{{.+}}(ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], i64 16, i1 false)
+    // loongarch64: call void @llvm.memcpy.{{.+}}(ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], i64 12, i1 false)
+    // powerpc64:   call void @llvm.memcpy.{{.+}}(ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], i64 16, i1 false)
+
+    // aarch64:     [[ABI_VALUE:%.+]] = load [[ABI_TYPE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // loongarch64: [[ABI_VALUE:%.+]] = load [[ABI_TYPE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // powerpc64:   [[ABI_VALUE:%.+]] = load [[ABI_TYPE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+
+    // aarch64:     call void @receives_doublefloat([[ABI_TYPE]] {{(inreg )?}}[[ABI_VALUE]])
+    // loongarch64: call void @receives_doublefloat([[ABI_TYPE]] {{(inreg )?}}[[ABI_VALUE]])
+    // powerpc64:   call void @receives_doublefloat([[ABI_TYPE]] {{(inreg )?}}[[ABI_VALUE]])
+    let x = DoubleFloat { f: 1., g: 2. };
+    receives_doublefloat(x);
+}
+
+// This test causes an ICE in sparc64 ABI code (https://github.com/rust-lang/rust/issues/122620)
+#[cfg(not(target_arch = "sparc64"))]
+// aarch64-LABEL:     @return_doublefloat
+// loongarch64-LABEL: @return_doublefloat
+// powerpc64-LABEL:   @return_doublefloat
+#[no_mangle]
+pub unsafe fn return_doublefloat() -> DoubleFloat {
+    // powerpc returns this struct via sret pointer, it doesn't use the cast ABI.
+
+    // powerpc64: [[RETVAL:%.+]] = alloca %DoubleFloat, align 8
+    // powerpc64: call void @returns_doublefloat(ptr {{.+}} [[RETVAL]])
+
+
+    // The other targets copy the cast ABI type to an alloca.
+
+    // aarch64:     [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:\[2 x i64\]]], align [[ABI_ALIGN:8]]
+    // loongarch64: [[ABI_ALLOCA:%.+]] = alloca [[ABI_TYPE:{ double, float }]], align [[ABI_ALIGN:8]]
+
+    // aarch64:     [[RUST_ALLOCA:%.+]] = alloca %DoubleFloat, align [[RUST_ALIGN:8]]
+    // loongarch64: [[RUST_ALLOCA:%.+]] = alloca %DoubleFloat, align [[RUST_ALIGN:8]]
+
+    // aarch64:     [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_doublefloat()
+    // loongarch64: [[ABI_VALUE:%.+]] = call [[ABI_TYPE]] @returns_doublefloat()
+
+    // aarch64:     store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+    // loongarch64: store [[ABI_TYPE]] [[ABI_VALUE]], ptr [[ABI_ALLOCA]], align [[ABI_ALIGN]]
+
+    // aarch64:     call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 16, i1 false)
+    // loongarch64: call void @llvm.memcpy.{{.+}}(ptr align [[RUST_ALIGN]] [[RUST_ALLOCA]], ptr align [[ABI_ALIGN]] [[ABI_ALLOCA]], i64 12, i1 false)
+    returns_doublefloat()
+}
diff --git a/tests/codegen/cffi/ffi-out-of-bounds-loads.rs b/tests/codegen/cffi/ffi-out-of-bounds-loads.rs
index 7eda6cf4d576e..8b32e902b3f55 100644
--- a/tests/codegen/cffi/ffi-out-of-bounds-loads.rs
+++ b/tests/codegen/cffi/ffi-out-of-bounds-loads.rs
@@ -1,8 +1,21 @@
+//@ revisions: linux apple
+//@ compile-flags: -C opt-level=0 -C no-prepopulate-passes
+
+//@[linux] compile-flags: --target x86_64-unknown-linux-gnu
+//@[linux] needs-llvm-components: x86
+//@[apple] compile-flags: --target x86_64-apple-darwin
+//@[apple] needs-llvm-components: x86
+
 // Regression test for #29988
 
-//@ compile-flags: -C no-prepopulate-passes
-//@ only-x86_64
-//@ ignore-windows
+#![feature(no_core, lang_items)]
+#![crate_type = "lib"]
+#![no_std]
+#![no_core]
+
+#[lang="sized"] trait Sized { }
+#[lang="freeze"] trait Freeze { }
+#[lang="copy"] trait Copy { }
 
 #[repr(C)]
 struct S {
@@ -15,11 +28,14 @@ extern "C" {
     fn foo(s: S);
 }
 
-fn main() {
+// CHECK-LABEL: @test
+#[no_mangle]
+pub fn test() {
     let s = S { f1: 1, f2: 2, f3: 3 };
     unsafe {
-        // CHECK: load { i64, i32 }, {{.*}}, align 4
-        // CHECK: call void @foo({ i64, i32 } {{.*}})
+        // CHECK: [[ALLOCA:%.+]] = alloca { i64, i32 }, align 8
+        // CHECK: [[LOAD:%.+]] = load { i64, i32 }, ptr [[ALLOCA]], align 8
+        // CHECK: call void @foo({ i64, i32 } [[LOAD]])
         foo(s);
     }
 }
diff --git a/tests/ui/abi/extern/extern-pass-FiveU16s.rs b/tests/ui/abi/extern/extern-pass-FiveU16s.rs
new file mode 100644
index 0000000000000..5f1307beb28e6
--- /dev/null
+++ b/tests/ui/abi/extern/extern-pass-FiveU16s.rs
@@ -0,0 +1,30 @@
+//@ run-pass
+#![allow(improper_ctypes)]
+
+// Test a foreign function that accepts and returns a struct by value.
+
+// FiveU16s in particular is interesting because it is larger than a single 64 bit or 32 bit
+// register, which are used as cast destinations on some targets, but does not evenly divide those
+// sizes, causing there to be padding in the last element.
+
+#[derive(Copy, Clone, PartialEq, Debug)]
+pub struct FiveU16s {
+    one: u16,
+    two: u16,
+    three: u16,
+    four: u16,
+    five: u16,
+}
+
+#[link(name = "rust_test_helpers", kind = "static")]
+extern "C" {
+    pub fn rust_dbg_extern_identity_FiveU16s(v: FiveU16s) -> FiveU16s;
+}
+
+pub fn main() {
+    unsafe {
+        let x = FiveU16s { one: 22, two: 23, three: 24, four: 25, five: 26 };
+        let y = rust_dbg_extern_identity_FiveU16s(x);
+        assert_eq!(x, y);
+    }
+}
diff --git a/tests/ui/abi/extern/extern-return-FiveU16s.rs b/tests/ui/abi/extern/extern-return-FiveU16s.rs
new file mode 100644
index 0000000000000..d8ae8b2661c5a
--- /dev/null
+++ b/tests/ui/abi/extern/extern-return-FiveU16s.rs
@@ -0,0 +1,26 @@
+//@ run-pass
+#![allow(improper_ctypes)]
+
+pub struct FiveU16s {
+    one: u16,
+    two: u16,
+    three: u16,
+    four: u16,
+    five: u16,
+}
+
+#[link(name = "rust_test_helpers", kind = "static")]
+extern "C" {
+    pub fn rust_dbg_extern_return_FiveU16s() -> FiveU16s;
+}
+
+pub fn main() {
+    unsafe {
+        let y = rust_dbg_extern_return_FiveU16s();
+        assert_eq!(y.one, 10);
+        assert_eq!(y.two, 20);
+        assert_eq!(y.three, 30);
+        assert_eq!(y.four, 40);
+        assert_eq!(y.five, 50);
+    }
+}