x86 (32/64): go back to passing SIMD vectors by-ptr #141309

Merged · 2 commits · Jun 5, 2025
17 changes: 12 additions & 5 deletions compiler/rustc_monomorphize/src/mono_checks/abi_check.rs
@@ -35,8 +35,6 @@ fn do_check_simd_vector_abi<'tcx>(
is_call: bool,
loc: impl Fn() -> (Span, HirId),
) {
// We check this on all functions, including those using the "Rust" ABI.
// For the "Rust" ABI it would be a bug if the lint ever triggered, but better safe than sorry.
let feature_def = tcx.sess.target.features_for_correct_vector_abi();
let codegen_attrs = tcx.codegen_fn_attrs(def_id);
let have_feature = |feat: Symbol| {
@@ -123,8 +121,9 @@ fn do_check_wasm_abi<'tcx>(
is_call: bool,
loc: impl Fn() -> (Span, HirId),
) {
// Only proceed for `extern "C" fn` on wasm32-unknown-unknown (same check as what `adjust_for_foreign_abi` uses to call `compute_wasm_abi_info`),
// and only proceed if `wasm_c_abi_opt` indicates we should emit the lint.
// Only proceed for `extern "C" fn` on wasm32-unknown-unknown (same check as what
// `adjust_for_foreign_abi` uses to call `compute_wasm_abi_info`), and only proceed if
// `wasm_c_abi_opt` indicates we should emit the lint.
if !(tcx.sess.target.arch == "wasm32"
&& tcx.sess.target.os == "unknown"
&& tcx.wasm_c_abi_opt() == WasmCAbi::Legacy { with_lint: true }
@@ -157,8 +156,15 @@ fn check_instance_abi<'tcx>(tcx: TyCtxt<'tcx>, instance: Instance<'tcx>) {
else {
// An error will be reported during codegen if we cannot determine the ABI of this
// function.
tcx.dcx().delayed_bug("ABI computation failure should lead to compilation failure");
return;
};
// Unlike the call-site check, we do also check "Rust" ABI functions here.
// This should never trigger, *except* if we start making use of vector registers
// for the "Rust" ABI and the user disables those vector registers (which should trigger a
// warning as that's clearly disabling a "required" target feature for this target).
// Using such a function is where disabling the vector register actually can start leading
// to soundness issues, so erroring here seems good.
let loc = || {
let def_id = instance.def_id();
(
@@ -179,7 +185,8 @@ fn check_call_site_abi<'tcx>(
loc: impl Fn() -> (Span, HirId) + Copy,
) {
if callee.fn_sig(tcx).abi().is_rustic_abi() {
// we directly handle the soundness of Rust ABIs
// We directly handle the soundness of Rust ABIs -- so let's skip the majority of
// call sites to avoid a perf regression.
return;
}
let typing_env = ty::TypingEnv::fully_monomorphized();
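For context, a minimal sketch of the situation `do_check_simd_vector_abi` guards against (illustrative only; the type and extern declaration below are hypothetical, not from this PR). If the target feature needed to pass a vector in registers is disabled via `-Ctarget-feature`, caller and callee can disagree on how the value travels, which is the soundness hole this check closes:

```rust
#![feature(repr_simd)]

#[repr(simd)]
#[derive(Clone, Copy)]
pub struct F32x4([f32; 4]);

#[allow(improper_ctypes)]
extern "C" {
    // Compiled with e.g. `-Ctarget-feature=-sse` on x86, this signature
    // involves a vector type whose register-passing convention depends on
    // a disabled feature; rustc's ABI check lints on the declaration and
    // on every call site.
    fn takes_vector(v: F32x4);
}
```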
27 changes: 5 additions & 22 deletions compiler/rustc_target/src/callconv/mod.rs
@@ -7,7 +7,7 @@ use rustc_abi::{
use rustc_macros::HashStable_Generic;

pub use crate::spec::AbiMap;
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, RustcAbi, WasmCAbi};
use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, WasmCAbi};

mod aarch64;
mod amdgpu;
@@ -696,24 +696,6 @@ impl<'a, Ty> FnAbi<'a, Ty> {
_ => {}
};

// Decides whether we can pass the given SIMD argument via `PassMode::Direct`.
// May only return `true` if the target will always pass those arguments the same way,
// no matter what the user does with `-Ctarget-feature`! In other words, whatever
// target features are required to pass a SIMD value in registers must be listed in
// the `abi_required_features` for the current target and ABI.
let can_pass_simd_directly = |arg: &ArgAbi<'_, Ty>| match &*spec.arch {
// On x86, if we have SSE2 (which we have by default for x86_64), we can always pass up
// to 128-bit-sized vectors.
"x86" if spec.rustc_abi == Some(RustcAbi::X86Sse2) => arg.layout.size.bits() <= 128,
"x86_64" if spec.rustc_abi != Some(RustcAbi::X86Softfloat) => {
// FIXME once https://github.com/bytecodealliance/wasmtime/issues/10254 is fixed
// accept vectors up to 128bit rather than vectors of exactly 128bit.
arg.layout.size.bits() == 128
}
// So far, we haven't implemented this logic for any other target.
_ => false,
};

for (arg_idx, arg) in self
.args
.iter_mut()
@@ -813,9 +795,10 @@ impl<'a, Ty> FnAbi<'a, Ty> {
// target feature sets. Some more information about this
// issue can be found in #44367.
//
// Note that the intrinsic ABI is exempt here as those are not
// real functions anyway, and the backend expects very specific types.
if spec.simd_types_indirect && !can_pass_simd_directly(arg) {
// We *could* do better in some cases, e.g. on x86_64 targets where SSE2 is
// required. However, it turns out that that makes LLVM worse at optimizing this
// code, so we pass things indirectly even there. See #139029 for more on that.
if spec.simd_types_indirect {
arg.make_indirect();
}
}
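The effect of removing `can_pass_simd_directly` is easiest to see in the generated function signatures. A hedged sketch follows (the IR in the comments shows the expected shape, not output copied from an actual build):

```rust
#![feature(repr_simd)]

#[repr(simd)]
#[derive(Clone, Copy)]
pub struct F32x4([f32; 4]);

#[no_mangle]
pub fn id(v: F32x4) -> F32x4 {
    // Before this PR, on x86_64 (where SSE2 is guaranteed), a 128-bit
    // vector was passed and returned in XMM registers:
    //   define <4 x float> @id(<4 x float> %v)
    // After this PR it goes by pointer wherever `simd_types_indirect`
    // is set, even when SSE2 is available:
    //   define void @id(ptr sret([16 x i8]) %_0, ptr %v)
    v
}
```

Per the comment in the diff, keeping the direct path on SSE2 targets turned out to hurt LLVM's ability to optimize the surrounding code (#139029), so the uniform by-pointer convention wins on net.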
5 changes: 3 additions & 2 deletions tests/codegen/abi-x86-sse.rs
@@ -27,8 +27,9 @@ trait Copy {}
#[repr(simd)]
pub struct Sse([f32; 4]);

// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
// FIXME: due to #139029 we are passing them all indirectly.
// x86-64: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
// x86-32: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
// x86-32-nosse: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
#[no_mangle]
pub fn sse_id(x: Sse) -> Sse {
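Reading the new patterns: `void @sse_id(ptr sret([16 x i8]) ..., ptr ...)` means the vector result now travels through a hidden out-pointer (`sret`) and the argument arrives by pointer as well. Conceptually, the lowering looks roughly like the following sketch (hypothetical names; not code from this PR):

```rust
#![feature(repr_simd)]

#[repr(simd)]
#[derive(Clone, Copy)]
pub struct Sse([f32; 4]);

// Roughly what `sse_id` lowers to under the by-pointer ABI: the caller
// allocates the return slot and passes its address as a hidden first
// parameter.
#[no_mangle]
pub unsafe extern "C" fn sse_id_lowered(ret: *mut Sse, x: *const Sse) {
    // The callee writes the result through the hidden return slot instead
    // of returning the vector in an XMM register.
    unsafe { *ret = *x };
}
```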
20 changes: 5 additions & 15 deletions tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
@@ -1,14 +1,8 @@
//
//@ compile-flags: -C no-prepopulate-passes
// LLVM IR isn't very portable and the one tested here depends on the ABI
// which is different between x86 (where we use SSE registers) and others.
// `x86-64` and `x86-32-sse2` are identical, but compiletest does not support
// taking the union of multiple `only` annotations.
//@ revisions: x86-64 x86-32-sse2 other
//@[x86-64] only-x86_64
//@[x86-32-sse2] only-rustc_abi-x86-sse2
//@[other] ignore-rustc_abi-x86-sse2
//@[other] ignore-x86_64
// 32bit MSVC does not align things properly so we suppress high alignment annotations (#112480)
//@ ignore-i686-pc-windows-msvc
//@ ignore-i686-pc-windows-gnu

#![crate_type = "lib"]
#![allow(non_camel_case_types)]
@@ -47,9 +41,7 @@ pub fn build_array_s(x: [f32; 4]) -> S<4> {
#[no_mangle]
pub fn build_array_transmute_s(x: [f32; 4]) -> S<4> {
// CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
// x86-32: ret <4 x float> %[[VAL:.+]]
// x86-64: ret <4 x float> %[[VAL:.+]]
// other: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
// CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}

@@ -64,8 +56,6 @@ pub fn build_array_t(x: [f32; 4]) -> T {
#[no_mangle]
pub fn build_array_transmute_t(x: [f32; 4]) -> T {
// CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
// x86-32: ret <4 x float> %[[VAL:.+]]
// x86-64: ret <4 x float> %[[VAL:.+]]
// other: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
// CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
unsafe { std::mem::transmute(x) }
}
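For reading the assertions above: `S<4>` and `T` are `#[repr(simd)]` wrappers defined earlier in the test file. A sketch of their presumed shape (inferred from the function signatures; the definitions are not part of this diff):

```rust
#![feature(repr_simd)]

// Both are SIMD vectors of four f32, so a `[f32; 4]` can be transmuted
// into either; the test checks the resulting codegen.
#[repr(simd)]
pub struct S<const N: usize>([f32; N]);

#[repr(simd)]
pub struct T([f32; 4]);
```

With the ABI now uniform across targets, the transmuted vector is always stored through the return slot `%_0`, which is why the per-revision `ret <4 x float>` checks collapse into a single `CHECK: store`.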
16 changes: 9 additions & 7 deletions tests/codegen/simd/packed-simd.rs
@@ -30,16 +30,18 @@ fn load<T, const N: usize>(v: PackedSimd<T, N>) -> FullSimd<T, N> {
}
}

// CHECK-LABEL: define <3 x float> @square_packed_full(ptr{{[a-z_ ]*}} align 4 {{[^,]*}})
// CHECK-LABEL: square_packed_full
// CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align (8|16)]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
// CHECK-SAME: ptr{{[a-z_ ]*}} align 4
#[no_mangle]
pub fn square_packed_full(x: PackedSimd<f32, 3>) -> FullSimd<f32, 3> {
// The unoptimized version of this is not very interesting to check
// since `load` does not get inlined.
// opt3-NEXT: start:
// opt3-NEXT: load <3 x float>
// CHECK-NEXT: start
// noopt: alloca [[RET_TYPE]], [[RET_ALIGN]]
// CHECK: load <3 x float>
let x = load(x);
// opt3-NEXT: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
// opt3-NEXT: ret <3 x float> [[VREG:%[a-z0-9_]+]]
// CHECK: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
// CHECK-NEXT: store <3 x float> [[VREG]], ptr [[RET_VREG]], [[RET_ALIGN]]
// CHECK-NEXT: ret void
unsafe { intrinsics::simd_mul(x, x) }
}
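For context on what this test exercises: a `PackedSimd` has no inter-element padding, so its alignment is that of the element type (hence the `align 4` pointer for `f32`), while a `FullSimd` is aligned like a hardware vector. A sketch of the presumed definitions from the surrounding file (not shown in this diff):

```rust
#![feature(repr_simd)]

// Packed layout: size is exactly N * size_of::<T>(), alignment follows T.
#[repr(simd, packed)]
pub struct PackedSimd<T, const N: usize>([T; N]);

// Full layout: padded and aligned like a vector register.
#[repr(simd)]
pub struct FullSimd<T, const N: usize>([T; N]);
```

The rewritten checks then pin down the by-pointer convention: the `<3 x float>` result is written through the `sret` slot (`[[RET_VREG]]`) and the function returns `void`, on every revision rather than only on the non-SSE ones.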