diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 165caf9c3a..1984a1042a 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -2276,6 +2276,86 @@ pub unsafe fn vqnegq_s64(a: int64x2_t) -> int64x2_t { vqnegq_s64_(a) } +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqsub))] +pub unsafe fn vqsubb_s8(a: i8, b: i8) -> i8 { + let a: int8x8_t = vdup_n_s8(a); + let b: int8x8_t = vdup_n_s8(b); + simd_extract(vqsub_s8(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqsub))] +pub unsafe fn vqsubh_s16(a: i16, b: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + simd_extract(vqsub_s16(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqsub))] +pub unsafe fn vqsubs_s32(a: i32, b: i32) -> i32 { + let a: int32x2_t = vdup_n_s32(a); + let b: int32x2_t = vdup_n_s32(b); + simd_extract(vqsub_s32(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqsub))] +pub unsafe fn vqsubd_s64(a: i64, b: i64) -> i64 { + let a: int64x1_t = vdup_n_s64(a); + let b: int64x1_t = vdup_n_s64(b); + simd_extract(vqsub_s64(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqsub))] +pub unsafe fn vqsubb_u8(a: u8, b: u8) -> u8 { + let a: uint8x8_t = vdup_n_u8(a); + let b: uint8x8_t = vdup_n_u8(b); + simd_extract(vqsub_u8(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqsub))] +pub unsafe fn vqsubh_u16(a: u16, b: u16) -> u16 { + let a: uint16x4_t = vdup_n_u16(a); + let b: uint16x4_t = vdup_n_u16(b); + simd_extract(vqsub_u16(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqsub))] +pub unsafe fn vqsubs_u32(a: u32, b: u32) -> u32 { + let a: uint32x2_t = vdup_n_u32(a); + let b: uint32x2_t = vdup_n_u32(b); + simd_extract(vqsub_u32(a, b), 0) +} + +/// Saturating subtract +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqsub))] +pub unsafe fn vqsubd_u64(a: u64, b: u64) -> u64 { + let a: uint64x1_t = vdup_n_u64(a); + let b: uint64x1_t = vdup_n_u64(b); + simd_extract(vqsub_u64(a, b), 0) +} + /// Reverse bit order #[inline] #[target_feature(enable = "neon")] @@ -2698,6 +2778,86 @@ pub unsafe fn vrndiq_f64(a: float64x2_t) -> float64x2_t { vrndiq_f64_(a) } +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqadd))] +pub unsafe fn vqaddb_s8(a: i8, b: i8) -> i8 { + let a: int8x8_t = vdup_n_s8(a); + let b: int8x8_t = vdup_n_s8(b); + simd_extract(vqadd_s8(a, b), 0) +} + +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqadd))] +pub unsafe fn vqaddh_s16(a: i16, b: i16) -> i16 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + simd_extract(vqadd_s16(a, b), 0) +} + +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqadd))] +pub unsafe fn vqadds_s32(a: i32, b: i32) -> i32 { + let a: int32x2_t = vdup_n_s32(a); + let b: int32x2_t = vdup_n_s32(b); + simd_extract(vqadd_s32(a, b), 0) +} + +/// Saturating add +#[inline] 
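+// Illustrative note: the scalar saturating forms clamp rather than wrap, e.g.
+// vqaddb_s8(0x7F, 1) == 0x7F and vqsubb_u8(0, 1) == 0.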
+#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqadd))] +pub unsafe fn vqaddd_s64(a: i64, b: i64) -> i64 { + let a: int64x1_t = vdup_n_s64(a); + let b: int64x1_t = vdup_n_s64(b); + simd_extract(vqadd_s64(a, b), 0) +} + +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqadd))] +pub unsafe fn vqaddb_u8(a: u8, b: u8) -> u8 { + let a: uint8x8_t = vdup_n_u8(a); + let b: uint8x8_t = vdup_n_u8(b); + simd_extract(vqadd_u8(a, b), 0) +} + +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqadd))] +pub unsafe fn vqaddh_u16(a: u16, b: u16) -> u16 { + let a: uint16x4_t = vdup_n_u16(a); + let b: uint16x4_t = vdup_n_u16(b); + simd_extract(vqadd_u16(a, b), 0) +} + +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqadd))] +pub unsafe fn vqadds_u32(a: u32, b: u32) -> u32 { + let a: uint32x2_t = vdup_n_u32(a); + let b: uint32x2_t = vdup_n_u32(b); + simd_extract(vqadd_u32(a, b), 0) +} + +/// Saturating add +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(uqadd))] +pub unsafe fn vqaddd_u64(a: u64, b: u64) -> u64 { + let a: uint64x1_t = vdup_n_u64(a); + let b: uint64x1_t = vdup_n_u64(b); + simd_extract(vqadd_u64(a, b), 0) +} + /// Multiply #[inline] #[target_feature(enable = "neon")] @@ -3184,6 +3344,431 @@ pub unsafe fn vpminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { vpminnmq_f32_(a, b) } +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull))] +pub unsafe fn vqdmullh_s16(a: i16, b: i16) -> i32 { + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + simd_extract(vqdmull_s16(a, b), 0) +} + +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull))] +pub unsafe fn vqdmulls_s32(a: i32, b: i32) -> i64 { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulls.scalar")] + fn vqdmulls_s32_(a: i32, b: i32) -> i64; + } + vqdmulls_s32_(a, b) +} + +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +pub unsafe fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle4(b, b, [4, 5, 6, 7]); + vqdmull_s16(a, b) +} + +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +pub unsafe fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle2(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle2(b, b, [2, 3]); + vqdmull_s32(a, b) +} + +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +pub unsafe fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t { + let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]); + let b: int16x4_t = vdup_n_s16(b); + vqdmull_s16(a, b) +} + +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmull2))] +pub unsafe fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t { + let a: int32x2_t = simd_shuffle2(a, a, [2, 3]); + let b: int32x2_t = vdup_n_s32(b); + vqdmull_s32(a, b) +} + +/// Vector saturating doubling 
long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 4))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_laneq_s16<const N: i32>(a: int16x4_t, b: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(N);
+    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    vqdmull_s16(a, b)
+}
+
+/// Vector saturating doubling long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_laneq_s32<const N: i32>(a: int32x2_t, b: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(N);
+    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmullh_lane_s16<const N: i32>(a: i16, b: int16x4_t) -> i32 {
+    static_assert_imm2!(N);
+    let b: i16 = simd_extract(b, N as u32);
+    vqdmullh_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 4))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmullh_laneq_s16<const N: i32>(a: i16, b: int16x8_t) -> i32 {
+    static_assert_imm3!(N);
+    let b: i16 = simd_extract(b, N as u32);
+    vqdmullh_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmulls_lane_s32<const N: i32>(a: i32, b: int32x2_t) -> i64 {
+    static_assert_imm1!(N);
+    let b: i32 = simd_extract(b, N as u32);
+    vqdmulls_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmulls_laneq_s32<const N: i32>(a: i32, b: int32x4_t) -> i64 {
+    static_assert_imm2!(N);
+    let b: i32 = simd_extract(b, N as u32);
+    vqdmulls_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_high_lane_s16<const N: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    vqdmull_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_high_lane_s32<const N: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 4))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_high_laneq_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(N);
+    let a: int16x4_t = simd_shuffle4(a, a, [4, 5, 6, 7]);
+    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    vqdmull_s16(a, b)
+}
+
+/// Signed saturating doubling multiply long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmull2, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_high_laneq_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(N);
+    let a: int32x2_t = simd_shuffle2(a, a, [2, 3]);
+    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+pub unsafe fn vqdmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+    vqaddq_s32(a, vqdmull_high_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+pub unsafe fn vqdmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+    vqaddq_s64(a, vqdmull_high_s32(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+pub unsafe fn vqdmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+    vqaddq_s32(a, vqdmull_high_n_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2))]
+pub unsafe fn vqdmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+    vqaddq_s64(a, vqdmull_high_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_laneq_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(N);
+    vqaddq_s32(a, vqdmull_laneq_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_laneq_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(N);
+    vqaddq_s64(a, vqdmull_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_high_lane_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    vqaddq_s32(a, vqdmull_high_lane_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_high_laneq_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(N);
+    vqaddq_s32(a, vqdmull_high_laneq_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_high_lane_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    vqaddq_s64(a, vqdmull_high_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlal2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_high_laneq_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(N);
+    vqaddq_s64(a, vqdmull_high_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+pub unsafe fn vqdmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+    vqsubq_s32(a, vqdmull_high_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+pub unsafe fn vqdmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+    vqsubq_s64(a, vqdmull_high_s32(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+pub unsafe fn vqdmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+    vqsubq_s32(a, vqdmull_high_n_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2))]
+pub unsafe fn vqdmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+    vqsubq_s64(a, vqdmull_high_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_laneq_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(N);
+    vqsubq_s32(a, vqdmull_laneq_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_laneq_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(N);
+    vqsubq_s64(a, vqdmull_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_high_lane_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    vqsubq_s32(a, vqdmull_high_lane_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_high_laneq_s16<const N: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(N);
+    vqsubq_s32(a, vqdmull_high_laneq_s16::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_high_lane_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    vqsubq_s64(a, vqdmull_high_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmlsl2, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_high_laneq_s32<const N: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(N);
+    vqsubq_s64(a, vqdmull_high_laneq_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqdmulh))]
+pub unsafe fn vqdmulhh_s16(a: i16, b: i16) -> i16 {
+    let a: int16x4_t = vdup_n_s16(a);
+    let b: int16x4_t = vdup_n_s16(b);
+
simd_extract(vqdmulh_s16(a, b), 0) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh))] +pub unsafe fn vqdmulhs_s32(a: i32, b: i32) -> i32 { + let a: int32x2_t = vdup_n_s32(a); + let b: int32x2_t = vdup_n_s32(b); + simd_extract(vqdmulh_s32(a, b), 0) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { + static_assert_imm2!(N); + let b: i16 = simd_extract(b, N as u32); + vqdmulhh_s16(a, b) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { + static_assert_imm3!(N); + let b: i16 = simd_extract(b, N as u32); + vqdmulhh_s16(a, b) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { + static_assert_imm1!(N); + let b: i32 = simd_extract(b, N as u32); + vqdmulhs_s32(a, b) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(sqdmulh, N = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { + static_assert_imm2!(N); + let b: i32 = simd_extract(b, N as u32); + vqdmulhs_s32(a, b) +} + /// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] @@ -8011,6 +8596,78 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vqsubb_s8() { + let a: i8 = 42; + let b: i8 = 1; + let e: i8 = 41; + let r: i8 = transmute(vqsubb_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubh_s16() { + let a: i16 = 42; + let b: i16 = 1; + let e: i16 = 41; + let r: i16 = transmute(vqsubh_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubs_s32() { + let a: i32 = 42; + let b: i32 = 1; + let e: i32 = 41; + let r: i32 = transmute(vqsubs_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubd_s64() { + let a: i64 = 42; + let b: i64 = 1; + let e: i64 = 41; + let r: i64 = transmute(vqsubd_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubb_u8() { + let a: u8 = 42; + let b: u8 = 1; + let e: u8 = 41; + let r: u8 = transmute(vqsubb_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubh_u16() { + let a: u16 = 42; + let b: u16 = 1; + let e: u16 = 41; + let r: u16 = transmute(vqsubh_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubs_u32() { + let a: u32 = 42; + let b: u32 = 1; + let e: u32 = 41; + let r: u32 = transmute(vqsubs_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqsubd_u64() { + let a: u64 = 42; + let b: u64 = 1; + let e: u64 = 41; + let r: u64 = transmute(vqsubd_u64(transmute(a), 
transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vrbit_s8() { let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14); @@ -8283,6 +8940,78 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vqaddb_s8() { + let a: i8 = 42; + let b: i8 = 1; + let e: i8 = 43; + let r: i8 = transmute(vqaddb_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqaddh_s16() { + let a: i16 = 42; + let b: i16 = 1; + let e: i16 = 43; + let r: i16 = transmute(vqaddh_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqadds_s32() { + let a: i32 = 42; + let b: i32 = 1; + let e: i32 = 43; + let r: i32 = transmute(vqadds_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqaddd_s64() { + let a: i64 = 42; + let b: i64 = 1; + let e: i64 = 43; + let r: i64 = transmute(vqaddd_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqaddb_u8() { + let a: u8 = 42; + let b: u8 = 1; + let e: u8 = 43; + let r: u8 = transmute(vqaddb_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqaddh_u16() { + let a: u16 = 42; + let b: u16 = 1; + let e: u16 = 43; + let r: u16 = transmute(vqaddh_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqadds_u32() { + let a: u32 = 42; + let b: u32 = 1; + let e: u32 = 43; + let r: u32 = transmute(vqadds_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqaddd_u64() { + let a: u64 = 42; + let b: u64 = 1; + let e: u64 = 43; + let r: u64 = transmute(vqaddd_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_f64() { let a: f64 = 1.0; @@ -8692,6 +9421,404 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vqdmullh_s16() { + let a: i16 = 2; + let b: i16 = 3; + let e: i32 = 12; + let r: i32 = transmute(vqdmullh_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulls_s32() { + let a: i32 = 2; + let b: i32 = 3; + let e: i64 = 12; + let r: i64 = transmute(vqdmulls_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_s16() { + let a: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(40, 60, 84, 112); + let r: i32x4 = transmute(vqdmull_high_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_s32() { + let a: i32x4 = i32x4::new(0, 1, 4, 5); + let b: i32x4 = i32x4::new(1, 2, 5, 6); + let e: i64x2 = i64x2::new(40, 60); + let r: i64x2 = transmute(vqdmull_high_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_n_s16() { + let a: i16x8 = i16x8::new(0, 2, 8, 10, 8, 10, 12, 14); + let b: i16 = 2; + let e: i32x4 = i32x4::new(32, 40, 48, 56); + let r: i32x4 = transmute(vqdmull_high_n_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_n_s32() { + let a: i32x4 = i32x4::new(0, 2, 8, 10); + let b: i32 = 2; + let e: i64x2 = i64x2::new(32, 40); + let r: i64x2 = 
transmute(vqdmull_high_n_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_laneq_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0); + let e: i32x4 = i32x4::new(4, 8, 12, 16); + let r: i32x4 = transmute(vqdmull_laneq_s16::<4>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_laneq_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 2, 2, 0); + let e: i64x2 = i64x2::new(4, 8); + let r: i64x2 = transmute(vqdmull_laneq_s32::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmullh_lane_s16() { + let a: i16 = 2; + let b: i16x4 = i16x4::new(0, 2, 2, 0); + let e: i32 = 8; + let r: i32 = transmute(vqdmullh_lane_s16::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmullh_laneq_s16() { + let a: i16 = 2; + let b: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0); + let e: i32 = 8; + let r: i32 = transmute(vqdmullh_laneq_s16::<4>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulls_lane_s32() { + let a: i32 = 2; + let b: i32x2 = i32x2::new(0, 2); + let e: i64 = 8; + let r: i64 = transmute(vqdmulls_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulls_laneq_s32() { + let a: i32 = 2; + let b: i32x4 = i32x4::new(0, 2, 2, 0); + let e: i64 = 8; + let r: i64 = transmute(vqdmulls_laneq_s32::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_lane_s16() { + let a: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let b: i16x4 = i16x4::new(0, 2, 2, 0); + let e: i32x4 = i32x4::new(16, 20, 24, 28); + let r: i32x4 = transmute(vqdmull_high_lane_s16::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_lane_s32() { + let a: i32x4 = i32x4::new(0, 1, 4, 5); + let b: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(16, 20); + let r: i64x2 = transmute(vqdmull_high_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_laneq_s16() { + let a: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0); + let e: i32x4 = i32x4::new(16, 20, 24, 28); + let r: i32x4 = transmute(vqdmull_high_laneq_s16::<4>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_high_laneq_s32() { + let a: i32x4 = i32x4::new(0, 1, 4, 5); + let b: i32x4 = i32x4::new(0, 2, 2, 0); + let e: i64x2 = i64x2::new(16, 20); + let r: i64x2 = transmute(vqdmull_high_laneq_s32::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_s16() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let c: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(41, 62, 87, 116); + let r: i32x4 = transmute(vqdmlal_high_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_s32() { + let a: i64x2 = i64x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 1, 4, 5); + let c: i32x4 = i32x4::new(1, 2, 5, 6); + let e: i64x2 = 
i64x2::new(41, 62); + let r: i64x2 = transmute(vqdmlal_high_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_n_s16() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 2, 8, 10, 8, 10, 12, 14); + let c: i16 = 2; + let e: i32x4 = i32x4::new(33, 42, 51, 60); + let r: i32x4 = transmute(vqdmlal_high_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_n_s32() { + let a: i64x2 = i64x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 2, 8, 10); + let c: i32 = 2; + let e: i64x2 = i64x2::new(33, 42); + let r: i64x2 = transmute(vqdmlal_high_n_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_laneq_s16() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0); + let e: i32x4 = i32x4::new(5, 10, 15, 20); + let r: i32x4 = transmute(vqdmlal_laneq_s16::<2>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_laneq_s32() { + let a: i64x2 = i64x2::new(1, 2); + let b: i32x2 = i32x2::new(1, 2); + let c: i32x4 = i32x4::new(0, 2, 2, 0); + let e: i64x2 = i64x2::new(5, 10); + let r: i64x2 = transmute(vqdmlal_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_lane_s16() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let c: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i32x4 = i32x4::new(17, 22, 27, 32); + let r: i32x4 = transmute(vqdmlal_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_laneq_s16() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(17, 22, 27, 32); + let r: i32x4 = transmute(vqdmlal_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_lane_s32() { + let a: i64x2 = i64x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 1, 4, 5); + let c: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(17, 22); + let r: i64x2 = transmute(vqdmlal_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_high_laneq_s32() { + let a: i64x2 = i64x2::new(1, 2); + let b: i32x4 = i32x4::new(0, 1, 4, 5); + let c: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i64x2 = i64x2::new(17, 22); + let r: i64x2 = transmute(vqdmlal_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_s16() { + let a: i32x4 = i32x4::new(39, 58, 81, 108); + let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let c: i16x8 = i16x8::new(1, 2, 5, 6, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vqdmlsl_high_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_s32() { + let a: i64x2 = i64x2::new(39, 58); + let b: i32x4 = i32x4::new(0, 1, 4, 5); + let c: i32x4 = i32x4::new(1, 2, 5, 6); 
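+        // Illustrative note: sqdmlsl2 uses the upper halves of b and c, so each
+        // lane is a[i] - 2 * b_hi[i] * c_hi[i], e.g. 39 - 2*4*5 = -1 and
+        // 58 - 2*5*6 = -2, which is the expected vector below.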
+ let e: i64x2 = i64x2::new(-1, -2); + let r: i64x2 = transmute(vqdmlsl_high_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_n_s16() { + let a: i32x4 = i32x4::new(31, 38, 45, 52); + let b: i16x8 = i16x8::new(0, 2, 8, 10, 8, 10, 12, 14); + let c: i16 = 2; + let e: i32x4 = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vqdmlsl_high_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_n_s32() { + let a: i64x2 = i64x2::new(31, 38); + let b: i32x4 = i32x4::new(0, 2, 8, 10); + let c: i32 = 2; + let e: i64x2 = i64x2::new(-1, -2); + let r: i64x2 = transmute(vqdmlsl_high_n_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_laneq_s16() { + let a: i32x4 = i32x4::new(3, 6, 9, 12); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16x8 = i16x8::new(0, 2, 2, 0, 2, 0, 0, 0); + let e: i32x4 = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vqdmlsl_laneq_s16::<2>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_laneq_s32() { + let a: i64x2 = i64x2::new(3, 6); + let b: i32x2 = i32x2::new(1, 2); + let c: i32x4 = i32x4::new(0, 2, 2, 0); + let e: i64x2 = i64x2::new(-1, -2); + let r: i64x2 = transmute(vqdmlsl_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_lane_s16() { + let a: i32x4 = i32x4::new(15, 18, 21, 24); + let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let c: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i32x4 = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vqdmlsl_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_laneq_s16() { + let a: i32x4 = i32x4::new(15, 18, 21, 24); + let b: i16x8 = i16x8::new(0, 1, 4, 5, 4, 5, 6, 7); + let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vqdmlsl_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_lane_s32() { + let a: i64x2 = i64x2::new(15, 18); + let b: i32x4 = i32x4::new(0, 1, 4, 5); + let c: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(-1, -2); + let r: i64x2 = transmute(vqdmlsl_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_high_laneq_s32() { + let a: i64x2 = i64x2::new(15, 18); + let b: i32x4 = i32x4::new(0, 1, 4, 5); + let c: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i64x2 = i64x2::new(-1, -2); + let r: i64x2 = transmute(vqdmlsl_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhh_s16() { + let a: i16 = 1; + let b: i16 = 2; + let e: i16 = 0; + let r: i16 = transmute(vqdmulhh_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhs_s32() { + let a: i32 = 1; + let b: i32 = 2; + let e: i32 = 0; + let r: i32 = transmute(vqdmulhs_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhh_lane_s16() { + let a: i16 = 2; + let b: i16x4 
= i16x4::new(0, 0, 0x7F_FF, 0); + let e: i16 = 1; + let r: i16 = transmute(vqdmulhh_lane_s16::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhh_laneq_s16() { + let a: i16 = 2; + let b: i16x8 = i16x8::new(0, 0, 0x7F_FF, 0, 0, 0, 0, 0); + let e: i16 = 1; + let r: i16 = transmute(vqdmulhh_laneq_s16::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhs_lane_s32() { + let a: i32 = 2; + let b: i32x2 = i32x2::new(0, 0x7F_FF_FF_FF); + let e: i32 = 1; + let r: i32 = transmute(vqdmulhs_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhs_laneq_s32() { + let a: i32 = 2; + let b: i32x4 = i32x4::new(0, 0x7F_FF_FF_FF, 0, 0); + let e: i32 = 1; + let r: i32 = transmute(vqdmulhs_laneq_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vqrshlb_s8() { let a: i8 = 1; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index fcf8ad4eaa..a84f511210 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -3567,7 +3567,7 @@ vqnegq_s32_(a) pub unsafe fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v8i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i8")] fn vqsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; } @@ -3583,7 +3583,7 @@ vqsub_u8_(a, b) pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v16i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v16i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v16i8")] fn vqsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; } @@ -3599,7 +3599,7 @@ vqsubq_u8_(a, b) pub unsafe fn vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v4i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i16")] fn vqsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; } @@ -3615,7 +3615,7 @@ vqsub_u16_(a, b) pub unsafe fn vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v8i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i16")] fn vqsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; } @@ -3631,7 +3631,7 @@ vqsubq_u16_(a, b) pub unsafe fn vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v2i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i32")] fn vqsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } @@ -3647,7 +3647,7 @@ vqsub_u32_(a, b) pub unsafe fn vqsubq_u32(a: uint32x4_t, b: 
uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v4i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i32")] fn vqsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } @@ -3663,7 +3663,7 @@ vqsubq_u32_(a, b) pub unsafe fn vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v1i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v1i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v1i64")] fn vqsub_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; } @@ -3679,7 +3679,7 @@ vqsub_u64_(a, b) pub unsafe fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubu.v2i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i64")] fn vqsubq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; } @@ -3695,7 +3695,7 @@ vqsubq_u64_(a, b) pub unsafe fn vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v8i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i8")] fn vqsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } @@ -3711,7 +3711,7 @@ vqsub_s8_(a, b) pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v16i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v16i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v16i8")] fn vqsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } @@ -3727,7 +3727,7 @@ vqsubq_s8_(a, b) pub unsafe fn vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v4i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i16")] fn vqsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } @@ -3743,7 +3743,7 @@ vqsub_s16_(a, b) pub unsafe fn vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v8i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i16")] fn vqsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } @@ -3759,7 +3759,7 @@ vqsubq_s16_(a, b) pub unsafe fn vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v2i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i32")] fn vqsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } @@ -3775,7 +3775,7 @@ vqsub_s32_(a, b) pub unsafe fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "C" { - 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v4i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i32")] fn vqsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } @@ -3791,7 +3791,7 @@ vqsubq_s32_(a, b) pub unsafe fn vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v1i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v1i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v1i64")] fn vqsub_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; } @@ -3807,7 +3807,7 @@ vqsub_s64_(a, b) pub unsafe fn vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqsubs.v2i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i64")] fn vqsubq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; } @@ -4207,7 +4207,7 @@ vrhaddq_s32_(a, b) pub unsafe fn vqadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v8i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i8")] fn vqadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; } @@ -4223,7 +4223,7 @@ vqadd_u8_(a, b) pub unsafe fn vqaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v16i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v16i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v16i8")] fn vqaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; } @@ -4239,7 +4239,7 @@ vqaddq_u8_(a, b) pub unsafe fn vqadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v4i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i16")] fn vqadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; } @@ -4255,7 +4255,7 @@ vqadd_u16_(a, b) pub unsafe fn vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v8i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i16")] fn vqaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; } @@ -4271,7 +4271,7 @@ vqaddq_u16_(a, b) pub unsafe fn vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v2i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i32")] fn vqadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } @@ -4287,7 +4287,7 @@ vqadd_u32_(a, b) pub unsafe fn vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vqaddu.v4i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i32")] fn vqaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } @@ -4303,7 +4303,7 @@ vqaddq_u32_(a, b) pub unsafe fn vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v1i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v1i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v1i64")] fn vqadd_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; } @@ -4319,7 +4319,7 @@ vqadd_u64_(a, b) pub unsafe fn vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqaddu.v2i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i64")] fn vqaddq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; } @@ -4335,7 +4335,7 @@ vqaddq_u64_(a, b) pub unsafe fn vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v8i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i8")] fn vqadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } @@ -4351,7 +4351,7 @@ vqadd_s8_(a, b) pub unsafe fn vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v16i8")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v16i8")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v16i8")] fn vqaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } @@ -4367,7 +4367,7 @@ vqaddq_s8_(a, b) pub unsafe fn vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v4i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i16")] fn vqadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } @@ -4383,7 +4383,7 @@ vqadd_s16_(a, b) pub unsafe fn vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v8i16")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i16")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i16")] fn vqaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } @@ -4399,7 +4399,7 @@ vqaddq_s16_(a, b) pub unsafe fn vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v2i32")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i32")] fn vqadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } @@ -4415,7 +4415,7 @@ vqadd_s32_(a, b) pub unsafe fn vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v4i32")] + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.sadd.sat.v4i32")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i32")] fn vqaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } @@ -4431,7 +4431,7 @@ vqaddq_s32_(a, b) pub unsafe fn vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v1i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v1i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v1i64")] fn vqadd_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; } @@ -4447,7 +4447,7 @@ vqadd_s64_(a, b) pub unsafe fn vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqadds.v2i64")] + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i64")] #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i64")] fn vqaddq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; } @@ -5908,6 +5908,320 @@ pub unsafe fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { vminnmq_f32_(a, b) } +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v4i32")] + fn vqdmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t; + } +vqdmull_s16_(a, b) +} + +/// Signed saturating doubling multiply long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v2i64")] + fn vqdmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t; + } +vqdmull_s32_(a, b) +} + +/// Vector saturating doubling long multiply with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t { + vqdmull_s16(a, vdup_n_s16(b)) +} + +/// Vector saturating doubling long multiply with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t { + vqdmull_s32(a, vdup_n_s32(b)) +} + +/// Vector saturating doubling long multiply by scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))] 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    vqdmull_s16(a, b)
+}
+
+/// Vector saturating doubling long multiply by scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    vqdmull_s32(a, b)
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    vqaddq_s32(a, vqdmull_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    vqaddq_s64(a, vqdmull_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+    vqaddq_s32(a, vqdmull_n_s16(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
+pub unsafe fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+    vqaddq_s64(a, vqdmull_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply accumulate with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    vqaddq_s64(a, vqdmull_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    vqsubq_s32(a, vqdmull_s16(b, c))
+}
+
+/// Signed saturating doubling multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    vqsubq_s64(a, vqdmull_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+    vqsubq_s32(a, vqdmull_n_s16(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+    vqsubq_s64(a, vqdmull_n_s32(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    vqsubq_s32(a, vqdmull_lane_s16::<N>(b, c))
+}
+
+/// Vector widening saturating doubling multiply subtract with scalar
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    vqsubq_s64(a, vqdmull_lane_s32::<N>(b, c))
+}
+
+/// Signed saturating doubling multiply returning high half
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name =
"llvm.aarch64.neon.sqdmulh.v4i16")] + fn vqdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + } +vqdmulh_s16_(a, b) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v8i16")] + fn vqdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + } +vqdmulhq_s16_(a, b) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v2i32")] + fn vqdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } +vqdmulh_s32_(a, b) +} + +/// Signed saturating doubling multiply returning high half +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i32")] + fn vqdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vqdmulhq_s32_(a, b) +} + +/// Vector saturating doubling multiply high with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t { + let b: int16x4_t = vdup_n_s16(b); + vqdmulh_s16(a, b) +} + +/// Vector saturating doubling multiply high with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { + let b: int32x2_t = vdup_n_s32(b); + vqdmulh_s32(a, b) +} + +/// Vector saturating doubling multiply high with scalar +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t { + let b: int16x8_t = vdupq_n_s16(b); + vqdmulhq_s16(a, b) +} + +/// Vector saturating doubling multiply high with scalar +#[inline] 
+#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] +pub unsafe fn vqdmulhq_nq_s32(a: int32x4_t, b: i32) -> int32x4_t { + let b: int32x4_t = vdupq_n_s32(b); + vqdmulhq_s32(a, b) +} + /// Signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] @@ -15760,6 +16074,252 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let e: i32x4 = i32x4::new(0, 4, 12, 24); + let r: i32x4 = transmute(vqdmull_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(1, 2); + let e: i64x2 = i64x2::new(0, 4); + let r: i64x2 = transmute(vqdmull_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_n_s16() { + let a: i16x4 = i16x4::new(2, 4, 6, 8); + let b: i16 = 2; + let e: i32x4 = i32x4::new(8, 16, 24, 32); + let r: i32x4 = transmute(vqdmull_n_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_n_s32() { + let a: i32x2 = i32x2::new(2, 4); + let b: i32 = 2; + let e: i64x2 = i64x2::new(8, 16); + let r: i64x2 = transmute(vqdmull_n_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_lane_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x4 = i16x4::new(0, 2, 2, 0); + let e: i32x4 = i32x4::new(4, 8, 12, 16); + let r: i32x4 = transmute(vqdmull_lane_s16::<2>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmull_lane_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(4, 8); + let r: i64x2 = transmute(vqdmull_lane_s32::<1>(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_s16() { + let a: i32x4 = i32x4::new(1, 1, 1, 1); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16x4 = i16x4::new(2, 2, 2, 2); + let e: i32x4 = i32x4::new(5, 9, 13, 17); + let r: i32x4 = transmute(vqdmlal_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_s32() { + let a: i64x2 = i64x2::new(1, 1); + let b: i32x2 = i32x2::new(1, 2); + let c: i32x2 = i32x2::new(2, 2); + let e: i64x2 = i64x2::new(5, 9); + let r: i64x2 = transmute(vqdmlal_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_n_s16() { + let a: i32x4 = i32x4::new(1, 1, 1, 1); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16 = 2; + let e: i32x4 = i32x4::new(5, 9, 13, 17); + let r: i32x4 = transmute(vqdmlal_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_n_s32() { + let a: i64x2 = i64x2::new(1, 1); + let b: i32x2 = i32x2::new(1, 2); + let c: i32 = 2; + let e: i64x2 = i64x2::new(5, 9); + let r: i64x2 = transmute(vqdmlal_n_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_lane_s16() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + 
let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16x4 = i16x4::new(0, 2, 2, 0); + let e: i32x4 = i32x4::new(5, 10, 15, 20); + let r: i32x4 = transmute(vqdmlal_lane_s16::<2>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlal_lane_s32() { + let a: i64x2 = i64x2::new(1, 2); + let b: i32x2 = i32x2::new(1, 2); + let c: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(5, 10); + let r: i64x2 = transmute(vqdmlal_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_s16() { + let a: i32x4 = i32x4::new(3, 7, 11, 15); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16x4 = i16x4::new(2, 2, 2, 2); + let e: i32x4 = i32x4::new(-1, -1, -1, -1); + let r: i32x4 = transmute(vqdmlsl_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_s32() { + let a: i64x2 = i64x2::new(3, 7); + let b: i32x2 = i32x2::new(1, 2); + let c: i32x2 = i32x2::new(2, 2); + let e: i64x2 = i64x2::new(-1, -1); + let r: i64x2 = transmute(vqdmlsl_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_n_s16() { + let a: i32x4 = i32x4::new(3, 7, 11, 15); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16 = 2; + let e: i32x4 = i32x4::new(-1, -1, -1, -1); + let r: i32x4 = transmute(vqdmlsl_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_n_s32() { + let a: i64x2 = i64x2::new(3, 7); + let b: i32x2 = i32x2::new(1, 2); + let c: i32 = 2; + let e: i64x2 = i64x2::new(-1, -1); + let r: i64x2 = transmute(vqdmlsl_n_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_lane_s16() { + let a: i32x4 = i32x4::new(3, 6, 9, 12); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let c: i16x4 = i16x4::new(0, 2, 2, 0); + let e: i32x4 = i32x4::new(-1, -2, -3, -4); + let r: i32x4 = transmute(vqdmlsl_lane_s16::<2>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmlsl_lane_s32() { + let a: i64x2 = i64x2::new(3, 6); + let b: i32x2 = i32x2::new(1, 2); + let c: i32x2 = i32x2::new(0, 2); + let e: i64x2 = i64x2::new(-1, -2); + let r: i64x2 = transmute(vqdmlsl_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulh_s16() { + let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let e: i16x4 = i16x4::new(1, 1, 1, 1); + let r: i16x4 = transmute(vqdmulh_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhq_s16() { + let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let r: i16x8 = transmute(vqdmulhq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulh_s32() { + let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let b: i32x2 = i32x2::new(2, 2); + let e: i32x2 = i32x2::new(1, 1); + let r: i32x2 = transmute(vqdmulh_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + 
unsafe fn test_vqdmulhq_s32() { + let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let e: i32x4 = i32x4::new(1, 1, 1, 1); + let r: i32x4 = transmute(vqdmulhq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulh_n_s16() { + let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let b: i16 = 2; + let e: i16x4 = i16x4::new(1, 1, 1, 1); + let r: i16x4 = transmute(vqdmulh_n_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulh_n_s32() { + let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let b: i32 = 2; + let e: i32x2 = i32x2::new(1, 1); + let r: i32x2 = transmute(vqdmulh_n_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhq_nq_s16() { + let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let b: i16 = 2; + let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let r: i16x8 = transmute(vqdmulhq_nq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqdmulhq_nq_s32() { + let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let b: i32 = 2; + let e: i32x4 = i32x4::new(1, 1, 1, 1); + let r: i32x4 = transmute(vqdmulhq_nq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vqrshl_s8() { let a: i8x8 = i8x8::new(-128, 0x7F, 2, 3, 4, 5, 6, 7); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 1f0d70362c..7c99272c12 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -1296,16 +1296,30 @@ validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 arm = vqsub.s aarch64 = uqsub -link-arm = vqsubu._EXT_ +link-arm = llvm.usub.sat._EXT_ link-aarch64 = uqsub._EXT_ generate uint*_t, uint64x*_t arm = vqsub.s aarch64 = sqsub -link-arm = vqsubs._EXT_ +link-arm = llvm.ssub.sat._EXT_ link-aarch64 = sqsub._EXT_ generate int*_t, int64x*_t +/// Saturating subtract +name = vqsub +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0 +a = 42 +b = 1 +validate 41 + +aarch64 = sqsub +generate i8, i16, i32, i64 +aarch64 = uqsub +generate u8, u16, u32, u64 + /// Halving add name = vhadd a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 @@ -1433,16 +1447,30 @@ validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 arm = vqadd.s aarch64 = uqadd -link-arm = vqaddu._EXT_ +link-arm = llvm.uadd.sat._EXT_ link-aarch64 = uqadd._EXT_ generate uint*_t, uint64x*_t arm = vqadd.s aarch64 = sqadd -link-arm = vqadds._EXT_ +link-arm = llvm.sadd.sat._EXT_ link-aarch64 = sqadd._EXT_ generate int*_t, int64x*_t +/// Saturating add +name = vqadd +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0 +a = 42 +b = 1 +validate 43 + +aarch64 = sqadd +generate i8, i16, i32, i64 +aarch64 = uqadd +generate u8, u16, u32, u64 + /// Multiply name = vmul a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 @@ -2063,6 +2091,395 @@ aarch64 = fminnmp link-aarch64 = fminnmp._EXT_ generate float32x4_t:float32x4_t:float32x4_t +/// Signed saturating doubling multiply long +name = vqdmull +a = 0, 1, 2, 3, 4, 5, 
6, 7 +b = 1, 2, 3, 4, 5, 6, 7, 8 +validate 0, 4, 12, 24, 40, 60, 84, 112 + +aarch64 = sqdmull +link-aarch64 = sqdmull._EXT2_ +arm = vqdmull +link-arm = vqdmull._EXT2_ +generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0 +a = 2 +b = 3 +validate 12 + +aarch64 = sqdmull +generate i16:i16:i32 + +/// Signed saturating doubling multiply long +name = vqdmull +a = 2 +b = 3 +validate 12 + +aarch64 = sqdmull +link-aarch64 = sqdmulls.scalar +generate i32:i32:i64 + +/// Vector saturating doubling long multiply with scalar +name = vqdmull_n +no-q +multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b} +a = 2, 4, 6, 8 +b = 2 +validate 8, 16, 24, 32 + +aarch64 = sqdmull +arm = vqdmull +generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull_high +no-q +multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-halflen-halflen} +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {asc-halflen-halflen} +multi_fn = vqdmull-noqself-noext, a, b +a = 0, 1, 4, 5, 4, 5, 6, 7 +b = 1, 2, 5, 6, 5, 6, 7, 8 +validate 40, 60, 84, 112 + +aarch64 = sqdmull2 +generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmull_high_n +no-q +multi_fn = simd_shuffle-out_len-noext, a:in_ntt, a, a, {asc-out_len-out_len} +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vqdmull-in_ntt-noext, a, b +a = 0, 2, 8, 10, 8, 10, 12, 14 +b = 2 +validate 32, 40, 48, 56 + +aarch64 = sqdmull2 +generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t + +/// Vector saturating doubling long multiply by scalar +name = vqdmull_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-noext, b:in_t0, b, b, {dup-out_len-N as u32} +multi_fn = vqdmull-noqself-noext, a, b +a = 1, 2, 3, 4 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 4, 8, 12, 16 + +aarch64 = sqdmull +generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t + +arm = vqdmull +generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply long +name = vqdmullh_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmullh-noqself-noext, a, b +a = 2 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 8 + +aarch64 = sqdmull +generate i16:int16x4_t:i32, i16:int16x8_t:i32 + +/// Signed saturating doubling multiply long +name = vqdmulls_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmulls-noqself-noext, a, b +a = 2 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 8 + +aarch64 = sqdmull +generate i32:int32x2_t:i64, i32:int32x4_t:i64 + +/// Signed saturating doubling multiply long +name = vqdmull_high_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-noext, a:in_t, a, a, {asc-out_len-out_len} +multi_fn = simd_shuffle-out_len-noext, b:in_t, b, b, {dup-out_len-N as u32} +multi_fn = vqdmull-self-noext, a, b +a = 0, 1, 4, 5, 4, 5, 6, 7 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 16, 20, 24, 28 + +aarch64 = sqdmull2 +generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply long +name = 
vqdmull_high_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_shuffle-out_len-noext, a:half, a, a, {asc-out_len-out_len} +multi_fn = simd_shuffle-out_len-noext, b:half, b, b, {dup-out_len-N as u32} +multi_fn = vqdmull-noqself-noext, a, b +a = 0, 1, 4, 5, 4, 5, 6, 7 +b = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 16, 20, 24, 28 + +aarch64 = sqdmull2 +generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal +multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c} +a = 1, 1, 1, 1 +b = 1, 2, 3, 4 +c = 2, 2, 2, 2 +validate 5, 9, 13, 17 + +aarch64 = sqdmlal +arm = vqdmlal +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Vector widening saturating doubling multiply accumulate with scalar +name = vqdmlal +n-suffix +multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c} +a = 1, 1, 1, 1 +b = 1, 2, 3, 4 +c = 2 +validate 5, 9, 13, 17 + +aarch64 = sqdmlal +arm = vqdmlal +generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal_high +no-q +multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c} +a = 1, 2, 3, 4 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 1, 2, 5, 6, 5, 6, 7, 8 +validate 41, 62, 87, 116 + +aarch64 = sqdmlal2 +generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal_high_n +no-q +multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} +a = 1, 2, 3, 4 +b = 0, 2, 8, 10, 8, 10, 12, 14 +c = 2 +validate 33, 42, 51, 60 + +aarch64 = sqdmlal2 +generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t + +/// Vector widening saturating doubling multiply accumulate with scalar +name = vqdmlal_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate 5, 10, 15, 20 + +aarch64 = sqdmlal +generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t + +arm = vqdmlal +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply-add long +name = vqdmlal_high_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} +a = 1, 2, 3, 4 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate 17, 22, 27, 32 + +aarch64 = sqdmlal2 +generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl +multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c} +a = 3, 7, 11, 15 +b = 1, 2, 3, 4 +c = 2, 2, 2, 2 +validate -1, -1, -1, -1 + +aarch64 = sqdmlsl +arm = vqdmlsl +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Vector widening saturating doubling multiply subtract with scalar +name = vqdmlsl +n-suffix +multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c} +a = 3, 7, 11, 15 +b = 1, 2, 3, 4 +c = 2 +validate -1, -1, -1, -1 + +aarch64 = sqdmlsl +arm = vqdmlsl +generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t + +/// Signed saturating 
doubling multiply-subtract long +name = vqdmlsl_high +no-q +multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c} +a = 39, 58, 81, 108 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 1, 2, 5, 6, 5, 6, 7, 8 +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl2 +generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl_high_n +no-q +multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} +a = 31, 38, 45, 52 +b = 0, 2, 8, 10, 8, 10, 12, 14 +c = 2 +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl2 +generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t + +/// Vector widening saturating doubling multiply subtract with scalar +name = vqdmlsl_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} +a = 3, 6, 9, 12 +b = 1, 2, 3, 4 +c = 0, 2, 2, 0, 2, 0, 0, 0 +n = HFLEN +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl +generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t + +arm = vqdmlsl +generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t + +/// Signed saturating doubling multiply-subtract long +name = vqdmlsl_high_lane +in2-suffix +constn = N +multi_fn = static_assert_imm-in2_exp_len-N +multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} +a = 15, 18, 21, 24 +b = 0, 1, 4, 5, 4, 5, 6, 7 +c = 0, 2, 0, 0, 0, 0, 0, 0 +n = 1 +validate -1, -2, -3, -4 + +aarch64 = sqdmlsl2 +generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t + +/// Signed saturating doubling multiply returning high half +name = vqdmulh +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2, 2, 2, 2, 2, 2, 2, 2 +validate 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = sqdmulh +link-aarch64 = sqdmulh._EXT_ +arm = vqdmulh +link-arm = vqdmulh._EXT_ +generate int16x4_t, int16x8_t, int32x2_t, int32x4_t + +/// Signed saturating doubling multiply returning high half +name = vqdmulh +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0 +a = 1 +b = 2 +validate 0 + +aarch64 = sqdmulh +generate i16, i32 + +/// Vector saturating doubling multiply high with scalar +name = vqdmulh_n +out-suffix +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vqdmulh-out-noext, a, b +a = MAX, MAX, MAX, MAX +b = 2 +validate 1, 1, 1, 1 + +aarch64 = sqdmulh +arm = vqdmulh +generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t + +/// Vector saturating doubling multiply high with scalar +name = vqdmulhq_n +out-suffix +multi_fn = vdupq_n-in_ntt-noext, b:out_t, b +multi_fn = vqdmulh-out-noext, a, b +a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX +b = 2 +validate 1, 1, 1, 1, 1, 1, 1, 1 + +aarch64 = sqdmulh +arm = vqdmulh +generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t + +/// Signed saturating doubling multiply returning high half +name = vqdmulhh_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, b:in_t0, b, N as u32 +multi_fn = vqdmulhh-out_ntt-noext, a, b +a = 2 +b = 0, 0, MAX, 0, 0, 0, 0, 0 +n = 2 +validate 1 + +aarch64 = sqdmulh +generate i16:int16x4_t:i16, i16:int16x8_t:i16 + +/// Signed saturating doubling multiply returning high half +name = vqdmulhs_lane +constn = N +multi_fn = static_assert_imm-in_exp_len-N +multi_fn = simd_extract, 
b:in_t0, b, N as u32 +multi_fn = vqdmulhs-out_ntt-noext, a, b +a = 2 +b = 0, MAX, 0, 0 +n = 1 +validate 1 + +aarch64 = sqdmulh +generate i32:int32x2_t:i32, i32:int32x4_t:i32 + /// Signed saturating rounding shift left name = vqrshl a = MIN, MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index ab5a396c36..d8326a3e2b 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -320,10 +320,10 @@ fn type_to_noq_double_suffixes<'a>(out_t: &'a str, in_t: &'a str) -> String { fn type_to_noq_suffix(t: &str) -> &str { match t { - "int8x8_t" | "int8x16_t" => "_s8", - "int16x4_t" | "int16x8_t" => "_s16", - "int32x2_t" | "int32x4_t" => "_s32", - "int64x1_t" | "int64x2_t" => "_s64", + "int8x8_t" | "int8x16_t" | "i8" => "_s8", + "int16x4_t" | "int16x8_t" | "i16" => "_s16", + "int32x2_t" | "int32x4_t" | "i32" => "_s32", + "int64x1_t" | "int64x2_t" | "i64" => "_s64", "uint8x8_t" | "uint8x16_t" => "_u8", "uint16x4_t" | "uint16x8_t" => "_u16", "uint32x2_t" | "uint32x4_t" => "_u32", @@ -348,6 +348,7 @@ enum Suffix { NoQNSuffix, OutSuffix, Lane, + In2, } #[derive(Clone, Copy)] @@ -845,6 +846,7 @@ fn gen_aarch64( NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])), OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)), Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])), + In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])), }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() { @@ -1218,6 +1220,7 @@ fn gen_arm( NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])), OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)), Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])), + In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])), }; let current_aarch64 = current_aarch64 .clone() @@ -1729,6 +1732,7 @@ fn get_call( let start = match &*fn_format[1] { "0" => 0, "n" => n.unwrap(), + "out_len" => type_len(out_t) as i32, "halflen" => (type_len(in_t[1]) / 2) as i32, s => s.parse::<i32>().unwrap(), }; @@ -1747,6 +1751,7 @@ "out_bits_exp_len" => type_bits_exp_len(out_t), "in_exp_len" => type_exp_len(in_t[1]), "in_bits_exp_len" => type_bits_exp_len(in_t[1]), + "in2_exp_len" => type_exp_len(in_t[2]), _ => 0, }; if len == 0 { @@ -1922,6 +1927,10 @@ fn get_call( fn_name.push_str(type_to_suffix(in_t[1])); } else if fn_format[1] == "nself" { fn_name.push_str(type_to_n_suffix(in_t[1])); + } else if fn_format[1] == "out" { + fn_name.push_str(type_to_suffix(out_t)); + } else if fn_format[1] == "in2" { + fn_name.push_str(type_to_suffix(in_t[2])); } else if fn_format[1] == "signed" { fn_name.push_str(type_to_suffix(type_to_signed(in_t[1]))); } else if fn_format[1] == "unsigned" { @@ -2132,6 +2141,8 @@ mod test { suffix = OutSuffix; } else if line.starts_with("lane-suffixes") { suffix = Lane; + } else if line.starts_with("in2-suffix") { + suffix = In2; } else if line.starts_with("a = ") { a = line[4..].split(',').map(|v| v.trim().to_string()).collect(); } else if line.starts_with("b = ") {