Skip to content

Commit 9b81ab7

Browse files
tvladyslav authored and BurntSushi committed
[x86][sse4.1] Add phminposuw & pmul* instructions
pmulld is implemented via multiplication.
1 parent 663cf64 commit 9b81ab7

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed

src/x86/sse41.rs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,39 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
580580
constify_imm4!(rounding, call)
581581
}
582582

583+
/// Find minimal u16 element in vector.
584+
/// Place it in the first element of the resulting vector and its index
585+
/// in second element (formally bits [16..18] inclusive).
586+
/// All other elements are set to zero.
587+
#[inline(always)]
588+
#[target_feature = "+sse4.1"]
589+
#[cfg_attr(test, assert_instr(phminposuw))]
590+
pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 {
591+
phminposuw(a)
592+
}
593+
594+
/// Multiply the low 32-bit integers from each packed 64-bit element
595+
/// in `a` and `b`, and return the signed 64-bit results.
596+
#[inline(always)]
597+
#[target_feature = "+sse4.1"]
598+
#[cfg_attr(test, assert_instr(pmuldq))]
599+
pub unsafe fn _mm_mul_epi32(a: i32x4, b: i32x4) -> i64x2 {
600+
pmuldq(a, b)
601+
}
602+
603+
/// Multiply the packed 32-bit integers in a and b, producing intermediate
604+
/// 64-bit integers, and return the lowest 32 bits of each, whatever they might be,
605+
/// reinterpreted as a signed integer.
606+
/// While pmulld i32x4::splat(2), i32x4::splat(2) returns the obvious
607+
/// i32x4::splat(4), pmulld i32x4::splat(i32::MAX), i32x4::splat(2)
608+
/// would return a negative number.
609+
#[inline(always)]
610+
#[target_feature = "+sse4.1"]
611+
#[cfg_attr(test, assert_instr(pmulld))]
612+
pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 {
613+
a * b
614+
}
615+
583616

584617
#[allow(improper_ctypes)]
585618
extern "C" {
@@ -627,6 +660,10 @@ extern "C" {
627660
fn roundsd(a: f64x2, b: f64x2, rounding: i32) -> f64x2;
628661
#[link_name = "llvm.x86.sse41.round.ss"]
629662
fn roundss(a: f32x4, b: f32x4, rounding: i32) -> f32x4;
663+
#[link_name = "llvm.x86.sse41.phminposuw"]
664+
fn phminposuw(a: u16x8) -> u16x8;
665+
#[link_name = "llvm.x86.sse41.pmuldq"]
666+
fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
630667
}
631668

632669
#[cfg(test)]
@@ -1109,4 +1146,46 @@ mod tests {
11091146
let e = f32x4::new(-2.0, 3.5, 7.5, 15.5);
11101147
assert_eq!(r, e);
11111148
}
1149+
1150+
#[simd_test = "sse4.1"]
1151+
unsafe fn _mm_minpos_epu16_1() {
1152+
let a = u16x8::new(23, 18, 44, 97, 50, 13, 67, 66);
1153+
let r = sse41::_mm_minpos_epu16(a);
1154+
let e = u16x8::new(13, 5, 0, 0, 0, 0, 0, 0);
1155+
assert_eq!(r, e);
1156+
}
1157+
1158+
#[simd_test = "sse4.1"]
1159+
unsafe fn _mm_minpos_epu16_2() {
1160+
let a = u16x8::new(0, 18, 44, 97, 50, 13, 67, 66);
1161+
let r = sse41::_mm_minpos_epu16(a);
1162+
let e = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0);
1163+
assert_eq!(r, e);
1164+
}
1165+
1166+
#[simd_test = "sse4.1"]
1167+
unsafe fn _mm_mul_epi32() {
1168+
let a =
1169+
i32x4::new(15, 2 /* ignored */, 1234567, 4 /* ignored */);
1170+
let b = i32x4::new(
1171+
-20,
1172+
-256, /* ignored */
1173+
666666,
1174+
666666, /* ignored */
1175+
);
1176+
let r = sse41::_mm_mul_epi32(a, b);
1177+
let e = i64x2::new(-300, 823043843622);
1178+
assert_eq!(r, e);
1179+
}
1180+
1181+
#[simd_test = "sse4.1"]
1182+
unsafe fn _mm_mullo_epi32() {
1183+
let a = i32x4::new(15, -2, 1234567, 99999);
1184+
let b = i32x4::new(-20, -256, 666666, -99999);
1185+
let r = sse41::_mm_mullo_epi32(a, b);
1186+
// Attention, most significant bit in r[2] is treated as a sign bit!
1187+
// 1234567 * 666666 = 823043843622, truncated to i32: -1589877210
1188+
let e = i32x4::new(-300, 512, -1589877210, -1409865409);
1189+
assert_eq!(r, e);
1190+
}
11121191
}

0 commit comments

Comments
 (0)