@@ -580,6 +580,39 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
580
580
constify_imm4 ! ( rounding, call)
581
581
}
582
582
583
+ /// Find the minimal u16 element in the vector.
584
+ /// Place it in the first element of the resulting vector and its index
585
+ /// in the second element (formally bits [16..18] inclusive).
586
+ /// All other elements are set to zero.
587
+ #[ inline( always) ]
588
+ #[ target_feature = "+sse4.1" ]
589
+ #[ cfg_attr( test, assert_instr( phminposuw) ) ]
590
+ pub unsafe fn _mm_minpos_epu16 ( a : u16x8 ) -> u16x8 {
591
+ phminposuw ( a)
592
+ }
593
+
594
+ /// Multiply the low 32-bit integers from each packed 64-bit element
595
+ /// in `a` and `b`, and return the signed 64-bit products.
596
+ #[ inline( always) ]
597
+ #[ target_feature = "+sse4.1" ]
598
+ #[ cfg_attr( test, assert_instr( pmuldq) ) ]
599
+ pub unsafe fn _mm_mul_epi32 ( a : i32x4 , b : i32x4 ) -> i64x2 {
600
+ pmuldq ( a, b)
601
+ }
602
+
603
+ /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate
604
+ /// 64-bit integers, and return the lowest 32 bits of each product,
605
+ /// reinterpreted as a signed integer.
606
+ /// While `pmulld` of `i32x4::splat(2)` and `i32x4::splat(2)` returns the obvious
607
+ /// `i32x4::splat(4)`, `pmulld` of `i32x4::splat(i32::MAX)` and `i32x4::splat(2)`
608
+ /// would return a negative number.
609
+ #[ inline( always) ]
610
+ #[ target_feature = "+sse4.1" ]
611
+ #[ cfg_attr( test, assert_instr( pmulld) ) ]
612
+ pub unsafe fn _mm_mullo_epi32 ( a : i32x4 , b : i32x4 ) -> i32x4 {
613
+ a * b
614
+ }
615
+
583
616
584
617
#[ allow( improper_ctypes) ]
585
618
extern "C" {
@@ -627,6 +660,10 @@ extern "C" {
627
660
fn roundsd ( a : f64x2 , b : f64x2 , rounding : i32 ) -> f64x2 ;
628
661
#[ link_name = "llvm.x86.sse41.round.ss" ]
629
662
fn roundss ( a : f32x4 , b : f32x4 , rounding : i32 ) -> f32x4 ;
663
+ #[ link_name = "llvm.x86.sse41.phminposuw" ]
664
+ fn phminposuw ( a : u16x8 ) -> u16x8 ;
665
+ #[ link_name = "llvm.x86.sse41.pmuldq" ]
666
+ fn pmuldq ( a : i32x4 , b : i32x4 ) -> i64x2 ;
630
667
}
631
668
632
669
#[ cfg( test) ]
@@ -1109,4 +1146,46 @@ mod tests {
1109
1146
let e = f32x4:: new ( -2.0 , 3.5 , 7.5 , 15.5 ) ;
1110
1147
assert_eq ! ( r, e) ;
1111
1148
}
1149
+
1150
+ #[ simd_test = "sse4.1" ]
1151
+ unsafe fn _mm_minpos_epu16_1 ( ) { // minimum is not in the first element
1152
+ let a = u16x8:: new ( 23 , 18 , 44 , 97 , 50 , 13 , 67 , 66 ) ;
1153
+ let r = sse41:: _mm_minpos_epu16 ( a) ;
1154
+ let e = u16x8:: new ( 13 , 5 , 0 , 0 , 0 , 0 , 0 , 0 ) ; // minimum 13 sits at index 5 of `a`
1155
+ assert_eq ! ( r, e) ;
1156
+ }
1157
+
1158
+ #[ simd_test = "sse4.1" ]
1159
+ unsafe fn _mm_minpos_epu16_2 ( ) { // minimum is the first element
1160
+ let a = u16x8:: new ( 0 , 18 , 44 , 97 , 50 , 13 , 67 , 66 ) ;
1161
+ let r = sse41:: _mm_minpos_epu16 ( a) ;
1162
+ let e = u16x8:: new ( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ) ; // minimum 0 at index 0, so value and index are both zero
1163
+ assert_eq ! ( r, e) ;
1164
+ }
1165
+
1166
+ #[ simd_test = "sse4.1" ]
1167
+ unsafe fn _mm_mul_epi32 ( ) {
1168
+ let a =
1169
+ i32x4:: new ( 15 , 2 /* ignored */ , 1234567 , 4 /* ignored */ ) ;
1170
+ let b = i32x4:: new (
1171
+ -20 ,
1172
+ -256 , /* ignored */
1173
+ 666666 ,
1174
+ 666666 , /* ignored */
1175
+ ) ;
1176
+ let r = sse41:: _mm_mul_epi32 ( a, b) ;
1177
+ let e = i64x2:: new ( -300 , 823043843622 ) ; // 15 * -20 and 1234567 * 666666, from elements 0 and 2 only
1178
+ assert_eq ! ( r, e) ;
1179
+ }
1180
+
1181
+ #[ simd_test = "sse4.1" ]
1182
+ unsafe fn _mm_mullo_epi32 ( ) {
1183
+ let a = i32x4:: new ( 15 , -2 , 1234567 , 99999 ) ;
1184
+ let b = i32x4:: new ( -20 , -256 , 666666 , -99999 ) ;
1185
+ let r = sse41:: _mm_mullo_epi32 ( a, b) ;
1186
+ // Attention: only the low 32 bits of each product are kept, and the most significant of those bits in r[2] is treated as a sign bit!
1187
+ // 1234567 * 666666 = 823043843622, whose low 32 bits read as -1589877210; likewise 99999 * -99999 = -9999800001 reads as -1409865409.
1188
+ let e = i32x4:: new ( -300 , 512 , -1589877210 , -1409865409 ) ;
1189
+ assert_eq ! ( r, e) ;
1190
+ }
1112
1191
}
0 commit comments