
Commit 46aceed

Tony Sifkarovski authored and gnzlbg committed
[avx2] add _mm256_cvtepu{8,16,32}_epi{16,32,64} (#192)
1 parent 9b81ab7 commit 46aceed

File tree

2 files changed: +102 -13 lines changed

src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -173,6 +173,9 @@ mod v32 {
     define_ty! { i8x4, i8, i8, i8, i8 }
     define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 }
 
+    define_ty! { u8x4, u8, u8, u8, u8 }
+    define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 }
+
     define_casts!((i8x4, i32x4, as_i32x4), (i16x2, i64x2, as_i64x2));
 }
 
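The src/lib.rs hunk only adds a 32-bit-wide u8x4 vector type; it exists so that _mm256_cvtepu8_epi64 in src/x86/avx2.rs below can shuffle the four low lanes out of a u8x16 and then widen them with simd_cast. A rough scalar sketch of that select-then-widen pattern (plain Rust, no crate machinery; the helper name is made up for illustration):

// Rough scalar equivalent of the shuffle-then-cast pattern used by
// _mm256_cvtepu8_epi64: keep the four low u8 lanes, zero-extend each to i64.
// `select_then_widen` is a hypothetical name, not part of the crate.
fn select_then_widen(a: [u8; 16]) -> [i64; 4] {
    let low = [a[0], a[1], a[2], a[3]]; // the "simd_shuffle4" step
    [low[0] as i64, low[1] as i64, low[2] as i64, low[3] as i64] // the "simd_cast" step
}

fn main() {
    let mut a = [0u8; 16];
    a[..4].copy_from_slice(&[255, 1, 2, 3]);
    assert_eq!(select_then_widen(a), [255, 1, 2, 3]);
}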
src/x86/avx2.rs

Lines changed: 99 additions & 13 deletions
@@ -583,26 +583,66 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: i8x16) -> i32x8 {
     simd_cast::<::v64::i8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
 }
 
-/// An i8x4 type is pretty useless, but we need it as an intermediate type in
-/// _mm256_cvtepi8_epi64.
-#[repr(simd)]
-#[allow(non_camel_case_types)]
-struct i8x4(i8, i8, i8, i8);
-
 /// Sign-extend 8-bit integers to 64-bit integers.
 #[inline(always)]
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpmovsxbq))]
 pub unsafe fn _mm256_cvtepi8_epi64(a: i8x16) -> i64x4 {
-    simd_cast::<i8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+    simd_cast::<::v32::i8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+}
+
+/// Zero-extend unsigned 16-bit integers in `a` to 32-bit
+/// integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxwd))]
+pub unsafe fn _mm256_cvtepu16_epi32(a: u16x8) -> i32x8 {
+    simd_cast(a)
+}
+
+/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
+/// integers. The upper four elements of `a` are unused.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxwq))]
+pub unsafe fn _mm256_cvtepu16_epi64(a: u16x8) -> i64x4 {
+    simd_cast::<::v64::u16x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+}
+
+/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxdq))]
+pub unsafe fn _mm256_cvtepu32_epi64(a: u32x4) -> i64x4 {
+    simd_cast(a)
+}
+
+/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxbw))]
+pub unsafe fn _mm256_cvtepu8_epi16(a: u8x16) -> i16x16 {
+    simd_cast(a)
+}
+
+/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
+/// integers. The upper eight elements of `a` are unused.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxbd))]
+pub unsafe fn _mm256_cvtepu8_epi32(a: u8x16) -> i32x8 {
+    simd_cast::<::v64::u8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
+}
+
+/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
+/// integers. The upper twelve elements of `a` are unused.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpmovzxbq))]
+pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
+    simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
 }
 
-// TODO _mm256_cvtepu16_epi32
-// TODO _mm256_cvtepu16_epi64
-// TODO _mm256_cvtepu32_epi64
-// TODO _mm256_cvtepu8_epi16
-// TODO _mm256_cvtepu8_epi32
-// TODO _mm256_cvtepu8_epi64
 // TODO _m128i _mm256_extracti128_si256
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
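The only semantic difference from the existing _mm256_cvtepi8/16/32 conversions is the widening rule: the new vpmovzx-backed functions zero-extend, so a lane with its high bit set stays a large positive value instead of becoming negative. A per-lane scalar model (illustration only, not crate code; the helper names are made up):

// Per-lane model of the two widening rules for an 8-bit source lane.
fn sign_extend_lane(b: u8) -> i64 { b as i8 as i64 } // vpmovsxbq-style widening
fn zero_extend_lane(b: u8) -> i64 { b as i64 }       // vpmovzxbq-style widening

fn main() {
    assert_eq!(sign_extend_lane(0x80), -128); // signed: 0x80 -> -128
    assert_eq!(zero_extend_lane(0x80), 128);  // unsigned: 0x80 -> 128
}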
@@ -2738,6 +2778,52 @@ mod tests {
         assert_eq!(r, avx2::_mm256_cvtepi32_epi64(a));
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu16_epi32() {
+        let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq!(r, avx2::_mm256_cvtepu16_epi32(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu16_epi64() {
+        let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = i64x4::new(0, 1, 2, 3);
+        assert_eq!(r, avx2::_mm256_cvtepu16_epi64(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu32_epi64() {
+        let a = u32x4::new(0, 1, 2, 3);
+        let r = i64x4::new(0, 1, 2, 3);
+        assert_eq!(r, avx2::_mm256_cvtepu32_epi64(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu8_epi16() {
+        let a =
+            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r =
+            i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        assert_eq!(r, avx2::_mm256_cvtepu8_epi16(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu8_epi32() {
+        let a =
+            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        assert_eq!(r, avx2::_mm256_cvtepu8_epi32(a));
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_cvtepu8_epi64() {
+        let a =
+            u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = i64x4::new(0, 1, 2, 3);
+        assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_hadd_epi16() {
         let a = i16x16::splat(2);
0 commit comments
