From 6f697f71fcc53622e866707a0191e6d0baf44c9e Mon Sep 17 00:00:00 2001 From: jirong Date: Tue, 5 Jan 2021 00:57:19 +0000 Subject: [PATCH 01/10] compress_epi16 --- crates/core_arch/src/x86/avx512vbmi.rs | 2 - crates/core_arch/src/x86/avx512vbmi2.rs | 74 ++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 3 + crates/stdarch-verify/tests/x86-intel.rs | 3 + 4 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 crates/core_arch/src/x86/avx512vbmi2.rs diff --git a/crates/core_arch/src/x86/avx512vbmi.rs b/crates/core_arch/src/x86/avx512vbmi.rs index 21437e3da2..f0ff75162f 100644 --- a/crates/core_arch/src/x86/avx512vbmi.rs +++ b/crates/core_arch/src/x86/avx512vbmi.rs @@ -438,8 +438,6 @@ mod tests { use stdarch_test::simd_test; use crate::core_arch::x86::*; - //use crate::hint::black_box; - //use crate::mem::{self}; #[simd_test(enable = "avx512vbmi")] unsafe fn test_mm512_permutex2var_epi8() { diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs new file mode 100644 index 0000000000..df9c03fe26 --- /dev/null +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -0,0 +1,74 @@ +use crate::core_arch::{simd::*, /*simd_llvm::*,*/ x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi16&expand=1192) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k)) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi16&expand=1193) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { + transmute(vpcompressw( + a.as_i16x32(), + _mm512_setzero_si512().as_i16x32(), + k, + )) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.mask.compress.w.512"] + fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32; + + #[link_name = "llvm.x86.avx512.mask.compress.b.512"] + fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64; +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compress_epi16() { + let src = _mm512_set1_epi16(200); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_compress_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m512i(r, e); + } +} diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index b853451b49..68f1bd8991 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -678,6 +678,9 @@ pub use self::avx512ifma::*; mod avx512vbmi; pub use self::avx512vbmi::*; +mod avx512vbmi2; +pub use self::avx512vbmi2::*; + mod avx512bitalg; pub use self::avx512bitalg::*; diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 601549d5dc..51b3d054a4 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -479,6 +479,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { // The XML file names VBMI as "avx512_vbmi", while Rust calls // it "avx512vbmi". "avx512_vbmi" => String::from("avx512vbmi"), + // The XML file names VBMI2 as "avx512_vbmi2", while Rust calls + // it "avx512vbmi2". + "avx512_vbmi2" => String::from("avx512vbmi2"), // Some AVX512f intrinsics are also supported by Knight's Corner. // The XML lists them as avx512f/kncni, but we are solely gating // them behind avx512f since we don't have a KNC feature yet. From 903761fe744255f2c240d275f58cd38c634c4de6 Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 8 Jan 2021 01:20:34 +0000 Subject: [PATCH 02/10] compress_epi8 --- crates/core_arch/src/x86/avx512vbmi2.rs | 256 ++++++++++++++++++++++++ 1 file changed, 256 insertions(+) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index df9c03fe26..9e651ab59c 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -27,13 +27,141 @@ pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { )) } +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi16&expand=1190) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k)) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi16&expand=1191) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { + transmute(vpcompressw256( + a.as_i16x16(), + _mm256_setzero_si256().as_i16x16(), + k, + )) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi16&expand=1188) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k)) +} + +/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi16&expand=1189) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressw))] +pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpcompressw128( + a.as_i16x8(), + _mm_setzero_si128().as_i16x8(), + k, + )) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi8&expand=1210) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k)) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi8&expand=1211) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { + transmute(vpcompressb( + a.as_i8x64(), + _mm512_setzero_si512().as_i8x64(), + k, + )) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi8&expand=1208) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k)) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi8&expand=1209) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { + transmute(vpcompressb256( + a.as_i8x32(), + _mm256_setzero_si256().as_i8x32(), + k, + )) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi8&expand=1206) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k)) +} + +/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi8&expand=1207) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpcompressb))] +pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { + transmute(vpcompressb128( + a.as_i8x16(), + _mm_setzero_si128().as_i8x16(), + k, + )) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.mask.compress.w.512"] fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32; + #[link_name = "llvm.x86.avx512.mask.compress.w.256"] + fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.compress.w.128"] + fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8; #[link_name = "llvm.x86.avx512.mask.compress.b.512"] fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.compress.b.256"] + fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.compress.b.128"] + fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16; } #[cfg(test)] @@ -71,4 +199,132 @@ mod tests { ); assert_eq_m512i(r, e); } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compress_epi16() { + let src = _mm256_set1_epi16(200); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a); + let e = _mm256_set_epi16( + 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a); + let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compress_epi16() { + let src = _mm_set1_epi16(200); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_compress_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_compress_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_compress_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_compress_epi8() { + let src = _mm512_set1_epi8(100); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_compress_epi8( + src, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_compress_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_compress_epi8( + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_compress_epi8() { + let src = _mm256_set1_epi8(100); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_compress_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_compress_epi8() { + let src = _mm_set1_epi8(100); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a); + let e = _mm_set_epi8( + 100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_compress_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_compress_epi8(0b01010101_01010101, a); + let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); + assert_eq_m128i(r, e); + } } From 33ba1beb23cac61e3368b0068f5c17f1819d68fe Mon Sep 17 00:00:00 2001 From: jirong Date: Fri, 8 Jan 2021 18:32:07 +0000 Subject: [PATCH 03/10] expand_epi16, expand_epi8 --- crates/core_arch/src/x86/avx512vbmi2.rs | 313 ++++++++++++++++++++++++ 1 file changed, 313 insertions(+) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index 9e651ab59c..d00da42c16 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -147,6 +147,150 @@ pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { )) } +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi16&expand=2310) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k)) +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi16&expand=2311) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { + transmute(vpexpandw( + a.as_i16x32(), + _mm512_setzero_si512().as_i16x32(), + k, + )) +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi16&expand=2308) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k)) +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi16&expand=2309) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { + transmute(vpexpandw256( + a.as_i16x16(), + _mm256_setzero_si256().as_i16x16(), + k, + )) +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi16&expand=2306) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k)) +} + +/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi16&expand=2307) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandw))] +pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { + transmute(vpexpandw128( + a.as_i16x8(), + _mm_setzero_si128().as_i16x8(), + k, + )) +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi8&expand=2328) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k)) +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi8&expand=2329) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { + transmute(vpexpandb( + a.as_i8x64(), + _mm512_setzero_si512().as_i8x64(), + k, + )) +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi8&expand=2326) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k)) +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi8&expand=2327) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { + transmute(vpexpandb256( + a.as_i8x32(), + _mm256_setzero_si256().as_i8x32(), + k, + )) +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi8&expand=2324) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k)) +} + +/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi8&expand=2325) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpexpandb))] +pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { + transmute(vpexpandb128( + a.as_i8x16(), + _mm_setzero_si128().as_i8x16(), + k, + )) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.mask.compress.w.512"] @@ -162,6 +306,20 @@ extern "C" { fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32; #[link_name = "llvm.x86.avx512.mask.compress.b.128"] fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16; + + #[link_name = "llvm.x86.avx512.mask.expand.w.512"] + fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32; + #[link_name = "llvm.x86.avx512.mask.expand.w.256"] + fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx512.mask.expand.w.128"] + fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8; + + #[link_name = "llvm.x86.avx512.mask.expand.b.512"] + fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64; + #[link_name = "llvm.x86.avx512.mask.expand.b.256"] + fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32; + #[link_name = "llvm.x86.avx512.mask.expand.b.128"] + fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16; } #[cfg(test)] @@ -327,4 +485,159 @@ mod tests { let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expand_epi16() { + let src = _mm512_set1_epi16(200); + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16( + 200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23, + 200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expand_epi16() { + #[rustfmt::skip] + let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, + 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expand_epi16() { + let src = _mm256_set1_epi16(200); + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a); + let e = _mm256_set_epi16( + 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a); + let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expand_epi16() { + let src = _mm_set1_epi16(200); + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_mask_expand_epi16(src, 0b01010101, a); + let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expand_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm_maskz_expand_epi16(0b01010101, a); + let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_expand_epi8() { + let src = _mm512_set1_epi8(100); + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_mask_expand_epi8( + src, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39, + 100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47, + 100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55, + 100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_expand_epi8() { + #[rustfmt::skip] + let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + let r = _mm512_maskz_expand_epi8( + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101, + a, + ); + #[rustfmt::skip] + let e = _mm512_set_epi8( + 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39, + 0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47, + 0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55, + 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_expand_epi8() { + let src = _mm256_set1_epi8(100); + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23, + 100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_expand_epi8() { + #[rustfmt::skip] + let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a); + #[rustfmt::skip] + let e = _mm256_set_epi8( + 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, + 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_expand_epi8() { + let src = _mm_set1_epi8(100); + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a); + let e = _mm_set_epi8( + 100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_expand_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm_maskz_expand_epi8(0b01010101_01010101, a); + let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); + assert_eq_m128i(r, e); + } } From 5b6f90317a71607f061ab2ba357d488438fc510d Mon Sep 17 00:00:00 2001 From: jirong Date: Sat, 9 Jan 2021 01:43:06 +0000 Subject: [PATCH 04/10] shldv_epi64,epi32,epi16: mm512,mm256,mm --- crates/core_arch/src/x86/avx512vbmi2.rs | 1030 ++++++++++++++++++++++- 1 file changed, 1029 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index d00da42c16..b8bd18ef6e 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -1,4 +1,4 @@ -use crate::core_arch::{simd::*, /*simd_llvm::*,*/ x86::*}; +use crate::core_arch::{simd::*, simd_llvm::*, x86::*}; #[cfg(test)] use stdarch_test::assert_instr; @@ -291,6 +291,690 @@ pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { )) } +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi64&expand=5087) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi64&expand=5085) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm512_mask_shldv_epi64( + a: __m512i, + k: __mmask8, + b: __m512i, + c: __m512i, +) -> __m512i { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi64&expand=5086) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi64&expand=5084) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi64&expand=5082) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm256_mask_shldv_epi64( + a: __m256i, + k: __mmask8, + b: __m256i, + c: __m256i, +) -> __m256i { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi64&expand=5083) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi64&expand=5081) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi64&expand=5079) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm_mask_shldv_epi64( + a: __m128i, + k: __mmask8, + b: __m128i, + c: __m128i, +) -> __m128i { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi64&expand=5080) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvq))] +pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi32&expand=5078) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi32&expand=5076) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm512_mask_shldv_epi32( + a: __m512i, + k: __mmask16, + b: __m512i, + c: __m512i, +) -> __m512i { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi32&expand=5077) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi32&expand=5075) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi32&expand=5073) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm256_mask_shldv_epi32( + a: __m256i, + k: __mmask8, + b: __m256i, + c: __m256i, +) -> __m256i { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi32&expand=5074) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi32&expand=5072) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi32&expand=5070) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm_mask_shldv_epi32( + a: __m128i, + k: __mmask8, + b: __m128i, + c: __m128i, +) -> __m128i { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi32&expand=5071) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvd))] +pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi16&expand=5069) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi16&expand=5067) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm512_mask_shldv_epi16( + a: __m512i, + k: __mmask32, + b: __m512i, + c: __m512i, +) -> __m512i { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi16&expand=5068) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + let zero = _mm512_setzero_si512().as_i16x32(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi16&expand=5066) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi16&expand=5064) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm256_mask_shldv_epi16( + a: __m256i, + k: __mmask16, + b: __m256i, + c: __m256i, +) -> __m256i { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi16&expand=5065) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + let zero = _mm256_setzero_si256().as_i16x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi16&expand=5063) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi16&expand=5061) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm_mask_shldv_epi16( + a: __m128i, + k: __mmask8, + b: __m128i, + c: __m128i, +) -> __m128i { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, a.as_i16x8())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi16&expand=5062) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldvw))] +pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + let zero = _mm_setzero_si128().as_i16x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi64&expand=5141) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + transmute(vpshrdvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi64&expand=5139) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm512_mask_shrdv_epi64( + a: __m512i, + k: __mmask8, + b: __m512i, + c: __m512i, +) -> __m512i { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi64&expand=5140) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi64&expand=5138) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + transmute(vpshrdvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi64&expand=5136) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm256_mask_shrdv_epi64( + a: __m256i, + k: __mmask8, + b: __m256i, + c: __m256i, +) -> __m256i { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi64&expand=5137) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi64&expand=5135) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + transmute(vpshrdvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi64&expand=5133) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm_mask_shrdv_epi64( + a: __m128i, + k: __mmask8, + b: __m128i, + c: __m128i, +) -> __m128i { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi64&expand=5134) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvq))] +pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi32&expand=5132) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + transmute(vpshrdvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi32&expand=5130) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm512_mask_shrdv_epi32( + a: __m512i, + k: __mmask16, + b: __m512i, + c: __m512i, +) -> __m512i { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi32&expand=5131) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi32&expand=5129) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + transmute(vpshrdvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi32&expand=5127) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm256_mask_shrdv_epi32( + a: __m256i, + k: __mmask8, + b: __m256i, + c: __m256i, +) -> __m256i { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi32&expand=5128) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi32&expand=5126) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + transmute(vpshrdvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi32&expand=5124) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm_mask_shrdv_epi32( + a: __m128i, + k: __mmask8, + b: __m128i, + c: __m128i, +) -> __m128i { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi32&expand=5125) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvd))] +pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi16&expand=5123) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + transmute(vpshrdvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi16&expand=5121) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm512_mask_shrdv_epi16( + a: __m512i, + k: __mmask32, + b: __m512i, + c: __m512i, +) -> __m512i { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi16&expand=5122) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + let zero = _mm512_setzero_si512().as_i16x32(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi16&expand=5120) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + transmute(vpshrdvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi16&expand=5118) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm256_mask_shrdv_epi16( + a: __m256i, + k: __mmask16, + b: __m256i, + c: __m256i, +) -> __m256i { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi16&expand=5119) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + let zero = _mm256_setzero_si256().as_i16x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi16&expand=5117) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + transmute(vpshrdvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi16&expand=5115) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm_mask_shrdv_epi16( + a: __m128i, + k: __mmask8, + b: __m128i, + c: __m128i, +) -> __m128i { + let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, a.as_i16x8())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi16&expand=5116) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshrdvw))] +pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); + let zero = _mm_setzero_si128().as_i16x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.mask.compress.w.512"] @@ -320,6 +1004,44 @@ extern "C" { fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32; #[link_name = "llvm.x86.avx512.mask.expand.b.128"] fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16; + + #[link_name = "llvm.fshl.v8i64"] + fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8; + #[link_name = "llvm.fshl.v4i64"] + fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4; + #[link_name = "llvm.fshl.v2i64"] + fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2; + #[link_name = "llvm.fshl.v16i32"] + fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16; + #[link_name = "llvm.fshl.v8i32"] + fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8; + #[link_name = "llvm.fshl.v4i32"] + fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4; + #[link_name = "llvm.fshl.v32i16"] + fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32; + #[link_name = "llvm.fshl.v16i16"] + fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; + #[link_name = "llvm.fshl.v8i16"] + fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; + + #[link_name = "llvm.fshr.v8i64"] + fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8; + #[link_name = "llvm.fshr.v4i64"] + fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4; + #[link_name = "llvm.fshr.v2i64"] + fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2; + #[link_name = "llvm.fshr.v16i32"] + fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16; + #[link_name = "llvm.fshr.v8i32"] + fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8; + #[link_name = "llvm.fshr.v4i32"] + fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4; + #[link_name = "llvm.fshr.v32i16"] + fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32; + #[link_name = "llvm.fshr.v16i16"] + fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16; + #[link_name = "llvm.fshr.v8i16"] + fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8; } #[cfg(test)] @@ -640,4 +1362,310 @@ mod tests { let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1<<63); + let c = _mm512_set1_epi64(2); + let r = _mm512_shldv_epi64(a, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1<<63); + let c = _mm512_set1_epi64(2); + let r = _mm512_mask_shldv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1<<63); + let c = _mm512_set1_epi64(2); + let r = _mm512_maskz_shldv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1<<63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_shldv_epi64(a, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1<<63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_mask_shldv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1<<63); + let c = _mm256_set1_epi64x(2); + let r = _mm256_maskz_shldv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1<<63); + let c = _mm_set1_epi64x(2); + let r = _mm_shldv_epi64(a, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1<<63); + let c = _mm_set1_epi64x(2); + let r = _mm_mask_shldv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1<<63); + let c = _mm_set1_epi64x(2); + let r = _mm_maskz_shldv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1<<31); + let c = _mm512_set1_epi32(2); + let r = _mm512_shldv_epi32(a, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1<<31); + let c = _mm512_set1_epi32(2); + let r = _mm512_mask_shldv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1<<31); + let c = _mm512_set1_epi32(2); + let r = _mm512_maskz_shldv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1<<31); + let c = _mm256_set1_epi32(2); + let r = _mm256_shldv_epi32(a, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1<<31); + let c = _mm256_set1_epi32(2); + let r = _mm256_mask_shldv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1<<31); + let c = _mm256_set1_epi32(2); + let r = _mm256_maskz_shldv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1<<31); + let c = _mm_set1_epi32(2); + let r = _mm_shldv_epi32(a, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1<<31); + let c = _mm_set1_epi32(2); + let r = _mm_mask_shldv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1<<31); + let c = _mm_set1_epi32(2); + let r = _mm_maskz_shldv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1<<15); + let c = _mm512_set1_epi16(2); + let r = _mm512_shldv_epi16(a, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1<<15); + let c = _mm512_set1_epi16(2); + let r = _mm512_mask_shldv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldv_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1<<15); + let c = _mm512_set1_epi16(2); + let r = _mm512_maskz_shldv_epi16(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1<<15); + let c = _mm256_set1_epi16(2); + let r = _mm256_shldv_epi16(a, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1<<15); + let c = _mm256_set1_epi16(2); + let r = _mm256_mask_shldv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldv_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1<<15); + let c = _mm256_set1_epi16(2); + let r = _mm256_maskz_shldv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1<<15); + let c = _mm_set1_epi16(2); + let r = _mm_shldv_epi16(a, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1<<15); + let c = _mm_set1_epi16(2); + let r = _mm_mask_shldv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldv_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1<<15); + let c = _mm_set1_epi16(2); + let r = _mm_maskz_shldv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } } From adc972a4dba917ae95938d6d26e51aa2836c142d Mon Sep 17 00:00:00 2001 From: jirong Date: Sat, 9 Jan 2021 14:22:07 +0000 Subject: [PATCH 05/10] shrdv_epi64,epi32,epi16: mm512,mm256,mm --- crates/core_arch/src/x86/avx512vbmi2.rs | 528 +++++++++++++++++------- 1 file changed, 387 insertions(+), 141 deletions(-) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index b8bd18ef6e..94ce39496b 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -307,12 +307,7 @@ pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm512_mask_shldv_epi64( - a: __m512i, - k: __mmask8, - b: __m512i, - c: __m512i, -) -> __m512i { +pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); transmute(simd_select_bitmask(k, shf, a.as_i64x8())) } @@ -345,12 +340,7 @@ pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm256_mask_shldv_epi64( - a: __m256i, - k: __mmask8, - b: __m256i, - c: __m256i, -) -> __m256i { +pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); transmute(simd_select_bitmask(k, shf, a.as_i64x4())) } @@ -383,12 +373,7 @@ pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm_mask_shldv_epi64( - a: __m128i, - k: __mmask8, - b: __m128i, - c: __m128i, -) -> __m128i { +pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); transmute(simd_select_bitmask(k, shf, a.as_i64x2())) } @@ -421,12 +406,7 @@ pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm512_mask_shldv_epi32( - a: __m512i, - k: __mmask16, - b: __m512i, - c: __m512i, -) -> __m512i { +pub unsafe fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); transmute(simd_select_bitmask(k, shf, a.as_i32x16())) } @@ -437,7 +417,12 @@ pub unsafe fn _mm512_mask_shldv_epi32( #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { +pub unsafe fn _mm512_maskz_shldv_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, + c: __m512i, +) -> __m512i { let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -459,12 +444,7 @@ pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm256_mask_shldv_epi32( - a: __m256i, - k: __mmask8, - b: __m256i, - c: __m256i, -) -> __m256i { +pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); transmute(simd_select_bitmask(k, shf, a.as_i32x8())) } @@ -497,12 +477,7 @@ pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm_mask_shldv_epi32( - a: __m128i, - k: __mmask8, - b: __m128i, - c: __m128i, -) -> __m128i { +pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); transmute(simd_select_bitmask(k, shf, a.as_i32x4())) } @@ -535,12 +510,7 @@ pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm512_mask_shldv_epi16( - a: __m512i, - k: __mmask32, - b: __m512i, - c: __m512i, -) -> __m512i { +pub unsafe fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); transmute(simd_select_bitmask(k, shf, a.as_i16x32())) } @@ -551,7 +521,12 @@ pub unsafe fn _mm512_mask_shldv_epi16( #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { +pub unsafe fn _mm512_maskz_shldv_epi16( + k: __mmask32, + a: __m512i, + b: __m512i, + c: __m512i, +) -> __m512i { let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) @@ -573,12 +548,7 @@ pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm256_mask_shldv_epi16( - a: __m256i, - k: __mmask16, - b: __m256i, - c: __m256i, -) -> __m256i { +pub unsafe fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); transmute(simd_select_bitmask(k, shf, a.as_i16x16())) } @@ -589,7 +559,12 @@ pub unsafe fn _mm256_mask_shldv_epi16( #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { +pub unsafe fn _mm256_maskz_shldv_epi16( + k: __mmask16, + a: __m256i, + b: __m256i, + c: __m256i, +) -> __m256i { let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -611,12 +586,7 @@ pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm_mask_shldv_epi16( - a: __m128i, - k: __mmask8, - b: __m128i, - c: __m128i, -) -> __m128i { +pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); transmute(simd_select_bitmask(k, shf, a.as_i16x8())) } @@ -649,12 +619,7 @@ pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm512_mask_shrdv_epi64( - a: __m512i, - k: __mmask8, - b: __m512i, - c: __m512i, -) -> __m512i { +pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); transmute(simd_select_bitmask(k, shf, a.as_i64x8())) } @@ -687,12 +652,7 @@ pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm256_mask_shrdv_epi64( - a: __m256i, - k: __mmask8, - b: __m256i, - c: __m256i, -) -> __m256i { +pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); transmute(simd_select_bitmask(k, shf, a.as_i64x4())) } @@ -725,12 +685,7 @@ pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm_mask_shrdv_epi64( - a: __m128i, - k: __mmask8, - b: __m128i, - c: __m128i, -) -> __m128i { +pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); transmute(simd_select_bitmask(k, shf, a.as_i64x2())) } @@ -763,12 +718,7 @@ pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm512_mask_shrdv_epi32( - a: __m512i, - k: __mmask16, - b: __m512i, - c: __m512i, -) -> __m512i { +pub unsafe fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); transmute(simd_select_bitmask(k, shf, a.as_i32x16())) } @@ -779,7 +729,12 @@ pub unsafe fn _mm512_mask_shrdv_epi32( #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { +pub unsafe fn _mm512_maskz_shrdv_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, + c: __m512i, +) -> __m512i { let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -801,12 +756,7 @@ pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm256_mask_shrdv_epi32( - a: __m256i, - k: __mmask8, - b: __m256i, - c: __m256i, -) -> __m256i { +pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); transmute(simd_select_bitmask(k, shf, a.as_i32x8())) } @@ -839,12 +789,7 @@ pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm_mask_shrdv_epi32( - a: __m128i, - k: __mmask8, - b: __m128i, - c: __m128i, -) -> __m128i { +pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); transmute(simd_select_bitmask(k, shf, a.as_i32x4())) } @@ -877,12 +822,7 @@ pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm512_mask_shrdv_epi16( - a: __m512i, - k: __mmask32, - b: __m512i, - c: __m512i, -) -> __m512i { +pub unsafe fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); transmute(simd_select_bitmask(k, shf, a.as_i16x32())) } @@ -893,7 +833,12 @@ pub unsafe fn _mm512_mask_shrdv_epi16( #[inline] #[target_feature(enable = "avx512vbmi2")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { +pub unsafe fn _mm512_maskz_shrdv_epi16( + k: __mmask32, + a: __m512i, + b: __m512i, + c: __m512i, +) -> __m512i { let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) @@ -915,12 +860,7 @@ pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm256_mask_shrdv_epi16( - a: __m256i, - k: __mmask16, - b: __m256i, - c: __m256i, -) -> __m256i { +pub unsafe fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); transmute(simd_select_bitmask(k, shf, a.as_i16x16())) } @@ -931,7 +871,12 @@ pub unsafe fn _mm256_mask_shrdv_epi16( #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { +pub unsafe fn _mm256_maskz_shrdv_epi16( + k: __mmask16, + a: __m256i, + b: __m256i, + c: __m256i, +) -> __m256i { let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -953,12 +898,7 @@ pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm_mask_shrdv_epi16( - a: __m128i, - k: __mmask8, - b: __m128i, - c: __m128i, -) -> __m128i { +pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); transmute(simd_select_bitmask(k, shf, a.as_i16x8())) } @@ -1366,7 +1306,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_shldv_epi64() { let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1<<63); + let b = _mm512_set1_epi64(1 << 63); let c = _mm512_set1_epi64(2); let r = _mm512_shldv_epi64(a, b, c); let e = _mm512_set1_epi64(6); @@ -1376,7 +1316,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_mask_shldv_epi64() { let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1<<63); + let b = _mm512_set1_epi64(1 << 63); let c = _mm512_set1_epi64(2); let r = _mm512_mask_shldv_epi64(a, 0, b, c); assert_eq_m512i(r, a); @@ -1388,7 +1328,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_maskz_shldv_epi64() { let a = _mm512_set1_epi64(1); - let b = _mm512_set1_epi64(1<<63); + let b = _mm512_set1_epi64(1 << 63); let c = _mm512_set1_epi64(2); let r = _mm512_maskz_shldv_epi64(0, a, b, c); assert_eq_m512i(r, _mm512_setzero_si512()); @@ -1400,7 +1340,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_shldv_epi64() { let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1<<63); + let b = _mm256_set1_epi64x(1 << 63); let c = _mm256_set1_epi64x(2); let r = _mm256_shldv_epi64(a, b, c); let e = _mm256_set1_epi64x(6); @@ -1410,7 +1350,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_mask_shldv_epi64() { let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1<<63); + let b = _mm256_set1_epi64x(1 << 63); let c = _mm256_set1_epi64x(2); let r = _mm256_mask_shldv_epi64(a, 0, b, c); assert_eq_m256i(r, a); @@ -1422,7 +1362,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_maskz_shldv_epi64() { let a = _mm256_set1_epi64x(1); - let b = _mm256_set1_epi64x(1<<63); + let b = _mm256_set1_epi64x(1 << 63); let c = _mm256_set1_epi64x(2); let r = _mm256_maskz_shldv_epi64(0, a, b, c); assert_eq_m256i(r, _mm256_setzero_si256()); @@ -1434,7 +1374,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_shldv_epi64() { let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1<<63); + let b = _mm_set1_epi64x(1 << 63); let c = _mm_set1_epi64x(2); let r = _mm_shldv_epi64(a, b, c); let e = _mm_set1_epi64x(6); @@ -1444,7 +1384,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_mask_shldv_epi64() { let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1<<63); + let b = _mm_set1_epi64x(1 << 63); let c = _mm_set1_epi64x(2); let r = _mm_mask_shldv_epi64(a, 0, b, c); assert_eq_m128i(r, a); @@ -1456,7 +1396,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_maskz_shldv_epi64() { let a = _mm_set1_epi64x(1); - let b = _mm_set1_epi64x(1<<63); + let b = _mm_set1_epi64x(1 << 63); let c = _mm_set1_epi64x(2); let r = _mm_maskz_shldv_epi64(0, a, b, c); assert_eq_m128i(r, _mm_setzero_si128()); @@ -1468,7 +1408,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_shldv_epi32() { let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1<<31); + let b = _mm512_set1_epi32(1 << 31); let c = _mm512_set1_epi32(2); let r = _mm512_shldv_epi32(a, b, c); let e = _mm512_set1_epi32(6); @@ -1478,7 +1418,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_mask_shldv_epi32() { let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1<<31); + let b = _mm512_set1_epi32(1 << 31); let c = _mm512_set1_epi32(2); let r = _mm512_mask_shldv_epi32(a, 0, b, c); assert_eq_m512i(r, a); @@ -1490,7 +1430,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_maskz_shldv_epi32() { let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1<<31); + let b = _mm512_set1_epi32(1 << 31); let c = _mm512_set1_epi32(2); let r = _mm512_maskz_shldv_epi32(0, a, b, c); assert_eq_m512i(r, _mm512_setzero_si512()); @@ -1502,7 +1442,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_shldv_epi32() { let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1<<31); + let b = _mm256_set1_epi32(1 << 31); let c = _mm256_set1_epi32(2); let r = _mm256_shldv_epi32(a, b, c); let e = _mm256_set1_epi32(6); @@ -1512,7 +1452,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_mask_shldv_epi32() { let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1<<31); + let b = _mm256_set1_epi32(1 << 31); let c = _mm256_set1_epi32(2); let r = _mm256_mask_shldv_epi32(a, 0, b, c); assert_eq_m256i(r, a); @@ -1524,7 +1464,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_maskz_shldv_epi32() { let a = _mm256_set1_epi32(1); - let b = _mm256_set1_epi32(1<<31); + let b = _mm256_set1_epi32(1 << 31); let c = _mm256_set1_epi32(2); let r = _mm256_maskz_shldv_epi32(0, a, b, c); assert_eq_m256i(r, _mm256_setzero_si256()); @@ -1536,7 +1476,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_shldv_epi32() { let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1<<31); + let b = _mm_set1_epi32(1 << 31); let c = _mm_set1_epi32(2); let r = _mm_shldv_epi32(a, b, c); let e = _mm_set1_epi32(6); @@ -1546,7 +1486,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_mask_shldv_epi32() { let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1<<31); + let b = _mm_set1_epi32(1 << 31); let c = _mm_set1_epi32(2); let r = _mm_mask_shldv_epi32(a, 0, b, c); assert_eq_m128i(r, a); @@ -1558,7 +1498,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_maskz_shldv_epi32() { let a = _mm_set1_epi32(1); - let b = _mm_set1_epi32(1<<31); + let b = _mm_set1_epi32(1 << 31); let c = _mm_set1_epi32(2); let r = _mm_maskz_shldv_epi32(0, a, b, c); assert_eq_m128i(r, _mm_setzero_si128()); @@ -1570,7 +1510,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_shldv_epi16() { let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1<<15); + let b = _mm512_set1_epi16(1 << 15); let c = _mm512_set1_epi16(2); let r = _mm512_shldv_epi16(a, b, c); let e = _mm512_set1_epi16(6); @@ -1580,7 +1520,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_mask_shldv_epi16() { let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1<<15); + let b = _mm512_set1_epi16(1 << 15); let c = _mm512_set1_epi16(2); let r = _mm512_mask_shldv_epi16(a, 0, b, c); assert_eq_m512i(r, a); @@ -1592,7 +1532,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2")] unsafe fn test_mm512_maskz_shldv_epi16() { let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1<<15); + let b = _mm512_set1_epi16(1 << 15); let c = _mm512_set1_epi16(2); let r = _mm512_maskz_shldv_epi16(0, a, b, c); assert_eq_m512i(r, _mm512_setzero_si512()); @@ -1604,7 +1544,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_shldv_epi16() { let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1<<15); + let b = _mm256_set1_epi16(1 << 15); let c = _mm256_set1_epi16(2); let r = _mm256_shldv_epi16(a, b, c); let e = _mm256_set1_epi16(6); @@ -1614,7 +1554,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_mask_shldv_epi16() { let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1<<15); + let b = _mm256_set1_epi16(1 << 15); let c = _mm256_set1_epi16(2); let r = _mm256_mask_shldv_epi16(a, 0, b, c); assert_eq_m256i(r, a); @@ -1626,7 +1566,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm256_maskz_shldv_epi16() { let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1<<15); + let b = _mm256_set1_epi16(1 << 15); let c = _mm256_set1_epi16(2); let r = _mm256_maskz_shldv_epi16(0, a, b, c); assert_eq_m256i(r, _mm256_setzero_si256()); @@ -1638,7 +1578,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_shldv_epi16() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1<<15); + let b = _mm_set1_epi16(1 << 15); let c = _mm_set1_epi16(2); let r = _mm_shldv_epi16(a, b, c); let e = _mm_set1_epi16(6); @@ -1648,7 +1588,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_mask_shldv_epi16() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1<<15); + let b = _mm_set1_epi16(1 << 15); let c = _mm_set1_epi16(2); let r = _mm_mask_shldv_epi16(a, 0, b, c); assert_eq_m128i(r, a); @@ -1660,7 +1600,7 @@ mod tests { #[simd_test(enable = "avx512vbmi2,avx512vl")] unsafe fn test_mm_maskz_shldv_epi16() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1<<15); + let b = _mm_set1_epi16(1 << 15); let c = _mm_set1_epi16(2); let r = _mm_maskz_shldv_epi16(0, a, b, c); assert_eq_m128i(r, _mm_setzero_si128()); @@ -1668,4 +1608,310 @@ mod tests { let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let c = _mm512_set1_epi64(1); + let r = _mm512_shrdv_epi64(a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let c = _mm512_set1_epi64(1); + let r = _mm512_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let c = _mm512_set1_epi64(1); + let r = _mm512_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let c = _mm256_set1_epi64x(1); + let r = _mm256_shrdv_epi64(a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let c = _mm256_set1_epi64x(1); + let r = _mm256_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let c = _mm256_set1_epi64x(1); + let r = _mm256_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let c = _mm_set1_epi64x(1); + let r = _mm_shrdv_epi64(a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let c = _mm_set1_epi64x(1); + let r = _mm_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let c = _mm_set1_epi64x(1); + let r = _mm_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let c = _mm512_set1_epi32(1); + let r = _mm512_shrdv_epi32(a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let c = _mm512_set1_epi32(1); + let r = _mm512_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let c = _mm512_set1_epi32(1); + let r = _mm512_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let c = _mm256_set1_epi32(1); + let r = _mm256_shrdv_epi32(a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let c = _mm256_set1_epi32(1); + let r = _mm256_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let c = _mm256_set1_epi32(1); + let r = _mm256_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let c = _mm_set1_epi32(1); + let r = _mm_shrdv_epi32(a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let c = _mm_set1_epi32(1); + let r = _mm_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let c = _mm_set1_epi32(1); + let r = _mm_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let c = _mm512_set1_epi16(1); + let r = _mm512_shrdv_epi16(a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let c = _mm512_set1_epi16(1); + let r = _mm512_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let c = _mm512_set1_epi16(1); + let r = _mm512_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let c = _mm256_set1_epi16(1); + let r = _mm256_shrdv_epi16(a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let c = _mm256_set1_epi16(1); + let r = _mm256_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let c = _mm256_set1_epi16(1); + let r = _mm256_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let c = _mm_set1_epi16(1); + let r = _mm_shrdv_epi16(a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let c = _mm_set1_epi16(1); + let r = _mm_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let c = _mm_set1_epi16(1); + let r = _mm_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } } From cf7064d3e73dc618d2eb8cdeaaa3290ad2245474 Mon Sep 17 00:00:00 2001 From: jirong Date: Sat, 9 Jan 2021 20:51:16 +0000 Subject: [PATCH 06/10] shldi_epi64,epi32,epi16: mm512,mm256,mm --- crates/core_arch/src/x86/avx512vbmi2.rs | 1087 +++++++++++++++++++++++ 1 file changed, 1087 insertions(+) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index 94ce39496b..78091daa91 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -915,6 +915,507 @@ pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m1 transmute(simd_select_bitmask(k, shf, zero)) } +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi64&expand=5060) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvq( + a.as_i64x8(), + b.as_i64x8(), + _mm512_set1_epi64(imm8 as i64).as_i64x8(), + )) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi64&expand=5058) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shldi_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x8 = vpshldvq( + a.as_i64x8(), + b.as_i64x8(), + _mm512_set1_epi64(imm8 as i64).as_i64x8(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi64&expand=5059) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x8 = vpshldvq( + a.as_i64x8(), + b.as_i64x8(), + _mm512_set1_epi64(imm8 as i64).as_i64x8(), + ); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi64&expand=5057) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvq256( + a.as_i64x4(), + b.as_i64x4(), + _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + )) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi64&expand=5055) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_shldi_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x4 = vpshldvq256( + a.as_i64x4(), + b.as_i64x4(), + _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi64&expand=5056) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x4 = vpshldvq256( + a.as_i64x4(), + b.as_i64x4(), + _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + ); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi64&expand=5054) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvq128( + a.as_i64x2(), + b.as_i64x2(), + _mm_set1_epi64x(imm8 as i64).as_i64x2(), + )) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi64&expand=5052) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_shldi_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x2 = vpshldvq128( + a.as_i64x2(), + b.as_i64x2(), + _mm_set1_epi64x(imm8 as i64).as_i64x2(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +} + +/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi64&expand=5053) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x2 = vpshldvq128( + a.as_i64x2(), + b.as_i64x2(), + _mm_set1_epi64x(imm8 as i64).as_i64x2(), + ); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi32&expand=5051) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvd( + a.as_i32x16(), + b.as_i32x16(), + _mm512_set1_epi32(imm8).as_i32x16(), + )) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi32&expand=5049) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shldi_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x16 = vpshldvd( + a.as_i32x16(), + b.as_i32x16(), + _mm512_set1_epi32(imm8).as_i32x16(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi32&expand=5050) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x16 = vpshldvd( + a.as_i32x16(), + b.as_i32x16(), + _mm512_set1_epi32(imm8).as_i32x16(), + ); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi32&expand=5048) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvd256( + a.as_i32x8(), + b.as_i32x8(), + _mm256_set1_epi32(imm8).as_i32x8(), + )) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi32&expand=5046) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_shldi_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x8 = vpshldvd256( + a.as_i32x8(), + b.as_i32x8(), + _mm256_set1_epi32(imm8).as_i32x8(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi32&expand=5047) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x8 = vpshldvd256( + a.as_i32x8(), + b.as_i32x8(), + _mm256_set1_epi32(imm8).as_i32x8(), + ); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi32&expand=5045) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvd128( + a.as_i32x4(), + b.as_i32x4(), + _mm_set1_epi32(imm8).as_i32x4(), + )) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi32&expand=5043) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_shldi_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +} + +/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi32&expand=5044) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi16&expand=5042) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvw( + a.as_i16x32(), + b.as_i16x32(), + _mm512_set1_epi16(imm8 as i16).as_i16x32(), + )) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi16&expand=5040) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shldi_epi16( + src: __m512i, + k: __mmask32, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i16x32 = vpshldvw( + a.as_i16x32(), + b.as_i16x32(), + _mm512_set1_epi16(imm8 as i16).as_i16x32(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi16&expand=5041) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i16x32 = vpshldvw( + a.as_i16x32(), + b.as_i16x32(), + _mm512_set1_epi16(imm8 as i16).as_i16x32(), + ); + let zero = _mm512_setzero_si512().as_i16x32(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi16&expand=5039) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshldvw256( + a.as_i16x16(), + b.as_i16x16(), + _mm256_set1_epi16(imm8 as i16).as_i16x16(), + )) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi16&expand=5037) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_shldi_epi16( + src: __m256i, + k: __mmask16, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i16x16 = vpshldvw256( + a.as_i16x16(), + b.as_i16x16(), + _mm256_set1_epi16(imm8 as i16).as_i16x16(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi16&expand=5038) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + let shf: i16x16 = vpshldvw256( + a.as_i16x16(), + b.as_i16x16(), + _mm256_set1_epi16(imm8 as i16).as_i16x16(), + ); + let zero = _mm256_setzero_si256().as_i16x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi16&expand=5036) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + transmute(vpshldvw128( + a.as_i16x8(), + b.as_i16x8(), + _mm_set1_epi16(imm8 as i16).as_i16x8(), + )) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi16&expand=5034) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_shldi_epi16( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + let shf: i16x8 = vpshldvw128( + a.as_i16x8(), + b.as_i16x8(), + _mm_set1_epi16(imm8 as i16).as_i16x8(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +} + +/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi16&expand=5035) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let shf: i16x8 = vpshldvw128( + a.as_i16x8(), + b.as_i16x8(), + _mm_set1_epi16(imm8 as i16).as_i16x8(), + ); + let zero = _mm_setzero_si128().as_i16x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.mask.compress.w.512"] @@ -1914,4 +2415,590 @@ mod tests { let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_shldi_epi64(a, b, 2); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_mask_shldi_epi64(a, 0, a, b, 2); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi64(a, 0b11111111, a, b, 2); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(1 << 63); + let r = _mm512_maskz_shldi_epi64(0, a, b, 2); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi64(0b11111111, a, b, 2); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_shldi_epi64(a, b, 2); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_mask_shldi_epi64(a, 0, a, b, 2); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi64(a, 0b00001111, a, b, 2); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(1 << 63); + let r = _mm256_maskz_shldi_epi64(0, a, b, 2); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi64(0b00001111, a, b, 2); + let e = _mm256_set1_epi64x(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_shldi_epi64(a, b, 2); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_mask_shldi_epi64(a, 0, a, b, 2); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi64(a, 0b00000011, a, b, 2); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(1 << 63); + let r = _mm_maskz_shldi_epi64(0, a, b, 2); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi64(0b00000011, a, b, 2); + let e = _mm_set1_epi64x(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_shldi_epi32(a, b, 2); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_mask_shldi_epi32(a, 0, a, b, 2); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi32(a, 0b11111111_11111111, a, b, 2); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1 << 31); + let r = _mm512_maskz_shldi_epi32(0, a, b, 2); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi32(0b11111111_11111111, a, b, 2); + let e = _mm512_set1_epi32(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_shldi_epi32(a, b, 2); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_mask_shldi_epi32(a, 0, a, b, 2); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi32(a, 0b11111111, a, b, 2); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(1 << 31); + let r = _mm256_maskz_shldi_epi32(0, a, b, 2); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi32(0b11111111, a, b, 2); + let e = _mm256_set1_epi32(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_shldi_epi32(a, b, 2); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_mask_shldi_epi32(a, 0, a, b, 2); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi32(a, 0b00001111, a, b, 2); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(1 << 31); + let r = _mm_maskz_shldi_epi32(0, a, b, 2); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi32(0b00001111, a, b, 2); + let e = _mm_set1_epi32(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_shldi_epi16(a, b, 2); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_mask_shldi_epi16(a, 0, a, b, 2); + assert_eq_m512i(r, a); + let r = _mm512_mask_shldi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 2); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shldi_epi16() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1 << 15); + let r = _mm512_maskz_shldi_epi16(0, a, b, 2); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shldi_epi16(0b11111111_11111111_11111111_11111111, a, b, 2); + let e = _mm512_set1_epi16(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_shldi_epi16(a, b, 2); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_mask_shldi_epi16(a, 0, a, b, 2); + assert_eq_m256i(r, a); + let r = _mm256_mask_shldi_epi16(a, 0b11111111_11111111, a, b, 2); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shldi_epi16() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1 << 15); + let r = _mm256_maskz_shldi_epi16(0, a, b, 2); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shldi_epi16(0b11111111_11111111, a, b, 2); + let e = _mm256_set1_epi16(6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_shldi_epi16(a, b, 2); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_mask_shldi_epi16(a, 0, a, b, 2); + assert_eq_m128i(r, a); + let r = _mm_mask_shldi_epi16(a, 0b11111111, a, b, 2); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shldi_epi16() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1 << 15); + let r = _mm_maskz_shldi_epi16(0, a, b, 2); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shldi_epi16(0b11111111, a, b, 2); + let e = _mm_set1_epi16(6); + assert_eq_m128i(r, e); + } + /* + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let c = _mm512_set1_epi64(1); + let r = _mm512_shrdv_epi64(a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let c = _mm512_set1_epi64(1); + let r = _mm512_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let c = _mm512_set1_epi64(1); + let r = _mm512_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let c = _mm256_set1_epi64x(1); + let r = _mm256_shrdv_epi64(a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let c = _mm256_set1_epi64x(1); + let r = _mm256_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let c = _mm256_set1_epi64x(1); + let r = _mm256_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let c = _mm_set1_epi64x(1); + let r = _mm_shrdv_epi64(a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let c = _mm_set1_epi64x(1); + let r = _mm_mask_shrdv_epi64(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let c = _mm_set1_epi64x(1); + let r = _mm_maskz_shrdv_epi64(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let c = _mm512_set1_epi32(1); + let r = _mm512_shrdv_epi32(a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let c = _mm512_set1_epi32(1); + let r = _mm512_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let c = _mm512_set1_epi32(1); + let r = _mm512_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let c = _mm256_set1_epi32(1); + let r = _mm256_shrdv_epi32(a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let c = _mm256_set1_epi32(1); + let r = _mm256_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let c = _mm256_set1_epi32(1); + let r = _mm256_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let c = _mm_set1_epi32(1); + let r = _mm_shrdv_epi32(a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let c = _mm_set1_epi32(1); + let r = _mm_mask_shrdv_epi32(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let c = _mm_set1_epi32(1); + let r = _mm_maskz_shrdv_epi32(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdv_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let c = _mm512_set1_epi16(1); + let r = _mm512_shrdv_epi16(a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdv_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let c = _mm512_set1_epi16(1); + let r = _mm512_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdv_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let c = _mm512_set1_epi16(1); + let r = _mm512_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdv_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let c = _mm256_set1_epi16(1); + let r = _mm256_shrdv_epi16(a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdv_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let c = _mm256_set1_epi16(1); + let r = _mm256_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdv_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let c = _mm256_set1_epi16(1); + let r = _mm256_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdv_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let c = _mm_set1_epi16(1); + let r = _mm_shrdv_epi16(a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdv_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let c = _mm_set1_epi16(1); + let r = _mm_mask_shrdv_epi16(a, 0, b, c); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdv_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let c = _mm_set1_epi16(1); + let r = _mm_maskz_shrdv_epi16(0, a, b, c); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + */ } From 2ee39f3368e1c3c937289c6ec1c99a210410af04 Mon Sep 17 00:00:00 2001 From: jirong Date: Sun, 10 Jan 2021 00:15:50 +0000 Subject: [PATCH 07/10] shrdi_epi64,epi32,epi16: mm512,mm256,mm --- crates/core_arch/avx512vbmi2.md | 154 ++++ crates/core_arch/src/x86/avx512vbmi2.rs | 1087 ++++++++++++++++------- 2 files changed, 934 insertions(+), 307 deletions(-) create mode 100644 crates/core_arch/avx512vbmi2.md diff --git a/crates/core_arch/avx512vbmi2.md b/crates/core_arch/avx512vbmi2.md new file mode 100644 index 0000000000..43d497108e --- /dev/null +++ b/crates/core_arch/avx512vbmi2.md @@ -0,0 +1,154 @@ +["AVX512_VBMI2"]

+ + * [x] [`_mm_mask_compress_epi16`] + * [x] [`_mm_maskz_compress_epi16`] + * [x] [`_mm256_mask_compress_epi16`] + * [x] [`_mm256_maskz_compress_epi16`] + * [x] [`_mm512_mask_compress_epi16`] + * [x] [`_mm512_maskz_compress_epi16`] + * [x] [`_mm_mask_compress_epi8`] + * [x] [`_mm_maskz_compress_epi8`] + * [x] [`_mm256_mask_compress_epi8`] + * [x] [`_mm256_maskz_compress_epi8`] + * [x] [`_mm512_mask_compress_epi8`] + * [x] [`_mm512_maskz_compress_epi8`] + * [_] [`_mm_mask_compressstoreu_epi16`] + * [_] [`_mm256_mask_compressstoreu_epi16`] + * [_] [`_mm512_mask_compressstoreu_epi16`] + * [_] [`_mm_mask_compressstoreu_epi8`] + * [_] [`_mm256_mask_compressstoreu_epi8`] + * [_] [`_mm512_mask_compressstoreu_epi8`] + * [x] [`_mm_mask_expand_epi16`] + * [x] [`_mm_maskz_expand_epi16`] + * [x] [`_mm256_mask_expand_epi16`] + * [x] [`_mm256_maskz_expand_epi16`] + * [x] [`_mm512_mask_expand_epi16`] + * [x] [`_mm512_maskz_expand_epi16`] + * [x] [`_mm_mask_expand_epi8`] + * [x] [`_mm_maskz_expand_epi8`] + * [x] [`_mm256_mask_expand_epi8`] + * [x] [`_mm256_maskz_expand_epi8`] + * [x] [`_mm512_mask_expand_epi8`] + * [x] [`_mm512_maskz_expand_epi8`] + * [_] [`_mm_mask_expandloadu_epi16`] + * [_] [`_mm_maskz_expandloadu_epi16`] + * [_] [`_mm256_mask_expandloadu_epi16`] + * [_] [`_mm256_maskz_expandloadu_epi16`] + * [_] [`_mm512_mask_expandloadu_epi16`] + * [_] [`_mm512_maskz_expandloadu_epi16`] + * [_] [`_mm_mask_expandloadu_epi8`] + * [_] [`_mm_maskz_expandloadu_epi8`] + * [_] [`_mm256_mask_expandloadu_epi8`] + * [_] [`_mm256_maskz_expandloadu_epi8`] + * [_] [`_mm512_mask_expandloadu_epi8`] + * [_] [`_mm512_maskz_expandloadu_epi8`] + * [x] [`_mm_mask_shldi_epi16`] + * [x] [`_mm_maskz_shldi_epi16`] + * [x] [`_mm_shldi_epi16`] + * [x] [`_mm256_mask_shldi_epi16`] + * [x] [`_mm256_maskz_shldi_epi16`] + * [x] [`_mm256_shldi_epi16`] + * [x] [`_mm512_mask_shldi_epi16`] + * [x] [`_mm512_maskz_shldi_epi16`] + * [x] [`_mm512_shldi_epi16`] + * [x] [`_mm_mask_shldi_epi32`] + * [x] [`_mm_maskz_shldi_epi32`] + * [x] [`_mm_shldi_epi32`] + * [x] [`_mm256_mask_shldi_epi32`] + * [x] [`_mm256_maskz_shldi_epi32`] + * [x] [`_mm256_shldi_epi32`] + * [x] [`_mm512_mask_shldi_epi32`] + * [x] [`_mm512_maskz_shldi_epi32`] + * [x] [`_mm512_shldi_epi32`] + * [x] [`_mm_mask_shldi_epi64`] + * [x] [`_mm_maskz_shldi_epi64`] + * [x] [`_mm_shldi_epi64`] + * [x] [`_mm256_mask_shldi_epi64`] + * [x] [`_mm256_maskz_shldi_epi64`] + * [x] [`_mm256_shldi_epi64`] + * [x] [`_mm512_mask_shldi_epi64`] + * [x] [`_mm512_maskz_shldi_epi64`] + * [x] [`_mm512_shldi_epi64`] + * [x] [`_mm_mask_shldv_epi16`] + * [x] [`_mm_maskz_shldv_epi16`] + * [x] [`_mm_shldv_epi16`] + * [x] [`_mm256_mask_shldv_epi16`] + * [x] [`_mm256_maskz_shldv_epi16`] + * [x] [`_mm256_shldv_epi16`] + * [x] [`_mm512_mask_shldv_epi16`] + * [x] [`_mm512_maskz_shldv_epi16`] + * [x] [`_mm512_shldv_epi16`] + * [x] [`_mm_mask_shldv_epi32`] + * [x] [`_mm_maskz_shldv_epi32`] + * [x] [`_mm_shldv_epi32`] + * [x] [`_mm256_mask_shldv_epi32`] + * [x] [`_mm256_maskz_shldv_epi32`] + * [x] [`_mm256_shldv_epi32`] + * [x] [`_mm512_mask_shldv_epi32`] + * [x] [`_mm512_maskz_shldv_epi32`] + * [x] [`_mm512_shldv_epi32`] + * [x] [`_mm_mask_shldv_epi64`] + * [x] [`_mm_maskz_shldv_epi64`] + * [x] [`_mm_shldv_epi64`] + * [x] [`_mm256_mask_shldv_epi64`] + * [x] [`_mm256_maskz_shldv_epi64`] + * [x] [`_mm256_shldv_epi64`] + * [x] [`_mm512_mask_shldv_epi64`] + * [x] [`_mm512_maskz_shldv_epi64`] + * [x] [`_mm512_shldv_epi64`] + * [x] [`_mm_mask_shrdi_epi16`] + * [x] [`_mm_maskz_shrdi_epi16`] + * [x] [`_mm_shrdi_epi16`] + * [x] [`_mm256_mask_shrdi_epi16`] + * [x] [`_mm256_maskz_shrdi_epi16`] + * [x] [`_mm256_shrdi_epi16`] + * [x] [`_mm512_mask_shrdi_epi16`] + * [x] [`_mm512_maskz_shrdi_epi16`] + * [x] [`_mm512_shrdi_epi16`] + * [x] [`_mm_mask_shrdi_epi32`] + * [x] [`_mm_maskz_shrdi_epi32`] + * [x] [`_mm_shrdi_epi32`] + * [x] [`_mm256_mask_shrdi_epi32`] + * [x] [`_mm256_maskz_shrdi_epi32`] + * [x] [`_mm256_shrdi_epi32`] + * [x] [`_mm512_mask_shrdi_epi32`] + * [x] [`_mm512_maskz_shrdi_epi32`] + * [x] [`_mm512_shrdi_epi32`] + * [x] [`_mm_mask_shrdi_epi64`] + * [x] [`_mm_maskz_shrdi_epi64`] + * [x] [`_mm_shrdi_epi64`] + * [x] [`_mm256_mask_shrdi_epi64`] + * [x] [`_mm256_maskz_shrdi_epi64`] + * [x] [`_mm256_shrdi_epi64`] + * [x] [`_mm512_mask_shrdi_epi64`] + * [x] [`_mm512_maskz_shrdi_epi64`] + * [x] [`_mm512_shrdi_epi64`] + * [x] [`_mm_mask_shrdv_epi16`] + * [x] [`_mm_maskz_shrdv_epi16`] + * [x] [`_mm_shrdv_epi16`] + * [x] [`_mm256_mask_shrdv_epi16`] + * [x] [`_mm256_maskz_shrdv_epi16`] + * [x] [`_mm256_shrdv_epi16`] + * [x] [`_mm512_mask_shrdv_epi16`] + * [x] [`_mm512_maskz_shrdv_epi16`] + * [x] [`_mm512_shrdv_epi16`] + * [x] [`_mm_mask_shrdv_epi32`] + * [x] [`_mm_maskz_shrdv_epi32`] + * [x] [`_mm_shrdv_epi32`] + * [x] [`_mm256_mask_shrdv_epi32`] + * [x] [`_mm256_maskz_shrdv_epi32`] + * [x] [`_mm256_shrdv_epi32`] + * [x] [`_mm512_mask_shrdv_epi32`] + * [x] [`_mm512_maskz_shrdv_epi32`] + * [x] [`_mm512_shrdv_epi32`] + * [x] [`_mm_mask_shrdv_epi64`] + * [x] [`_mm_maskz_shrdv_epi64`] + * [x] [`_mm_shrdv_epi64`] + * [x] [`_mm256_mask_shrdv_epi64`] + * [x] [`_mm256_maskz_shrdv_epi64`] + * [x] [`_mm256_shrdv_epi64`] + * [x] [`_mm512_mask_shrdv_epi64`] + * [x] [`_mm512_maskz_shrdv_epi64`] + * [x] [`_mm512_shrdv_epi64`] + +

diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index 78091daa91..032bce9176 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -1416,6 +1416,507 @@ pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i transmute(simd_select_bitmask(k, shf, zero)) } +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi64&expand=5114) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvq( + a.as_i64x8(), + b.as_i64x8(), + _mm512_set1_epi64(imm8 as i64).as_i64x8(), + )) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi64&expand=5112) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shrdi_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x8 = vpshrdvq( + a.as_i64x8(), + b.as_i64x8(), + _mm512_set1_epi64(imm8 as i64).as_i64x8(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi64&expand=5113) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 255))] //should be vpshrdq +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x8 = vpshrdvq( + a.as_i64x8(), + b.as_i64x8(), + _mm512_set1_epi64(imm8 as i64).as_i64x8(), + ); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi64&expand=5111) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvq256( + a.as_i64x4(), + b.as_i64x4(), + _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + )) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi64&expand=5109) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_shrdi_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x4 = vpshrdvq256( + a.as_i64x4(), + b.as_i64x4(), + _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi64&expand=5110) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x4 = vpshrdvq256( + a.as_i64x4(), + b.as_i64x4(), + _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + ); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi64&expand=5108) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(2)] +pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvq128( + a.as_i64x2(), + b.as_i64x2(), + _mm_set1_epi64x(imm8 as i64).as_i64x2(), + )) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi64&expand=5106) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_shrdi_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x2 = vpshrdvq128( + a.as_i64x2(), + b.as_i64x2(), + _mm_set1_epi64x(imm8 as i64).as_i64x2(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +} + +/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi64&expand=5107) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i64x2 = vpshrdvq128( + a.as_i64x2(), + b.as_i64x2(), + _mm_set1_epi64x(imm8 as i64).as_i64x2(), + ); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi32&expand=5105) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvd( + a.as_i32x16(), + b.as_i32x16(), + _mm512_set1_epi32(imm8).as_i32x16(), + )) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi32&expand=5103) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shrdi_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x16 = vpshrdvd( + a.as_i32x16(), + b.as_i32x16(), + _mm512_set1_epi32(imm8).as_i32x16(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi32&expand=5104) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x16 = vpshrdvd( + a.as_i32x16(), + b.as_i32x16(), + _mm512_set1_epi32(imm8).as_i32x16(), + ); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi32&expand=5102) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvd256( + a.as_i32x8(), + b.as_i32x8(), + _mm256_set1_epi32(imm8).as_i32x8(), + )) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi32&expand=5100) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_shrdi_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x8 = vpshrdvd256( + a.as_i32x8(), + b.as_i32x8(), + _mm256_set1_epi32(imm8).as_i32x8(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi32&expand=5101) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x8 = vpshrdvd256( + a.as_i32x8(), + b.as_i32x8(), + _mm256_set1_epi32(imm8).as_i32x8(), + ); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi32&expand=5099) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(2)] +pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvd128( + a.as_i32x4(), + b.as_i32x4(), + _mm_set1_epi32(imm8).as_i32x4(), + )) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi32&expand=5097) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_shrdi_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +} + +/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi32&expand=5098) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi16&expand=5096) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvw( + a.as_i16x32(), + b.as_i16x32(), + _mm512_set1_epi16(imm8 as i16).as_i16x32(), + )) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi16&expand=5094) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_shrdi_epi16( + src: __m512i, + k: __mmask32, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i16x32 = vpshrdvw( + a.as_i16x32(), + b.as_i16x32(), + _mm512_set1_epi16(imm8 as i16).as_i16x32(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi16&expand=5095) +#[inline] +#[target_feature(enable = "avx512vbmi2")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i16x32 = vpshrdvw( + a.as_i16x32(), + b.as_i16x32(), + _mm512_set1_epi16(imm8 as i16).as_i16x32(), + ); + let zero = _mm512_setzero_si512().as_i16x32(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi16&expand=5093) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + transmute(vpshrdvw256( + a.as_i16x16(), + b.as_i16x16(), + _mm256_set1_epi16(imm8 as i16).as_i16x16(), + )) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi16&expand=5091) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_shrdi_epi16( + src: __m256i, + k: __mmask16, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let shf: i16x16 = vpshrdvw256( + a.as_i16x16(), + b.as_i16x16(), + _mm256_set1_epi16(imm8 as i16).as_i16x16(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi16&expand=5092) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + let shf: i16x16 = vpshrdvw256( + a.as_i16x16(), + b.as_i16x16(), + _mm256_set1_epi16(imm8 as i16).as_i16x16(), + ); + let zero = _mm256_setzero_si256().as_i16x16(); + transmute(simd_select_bitmask(k, shf, zero)) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi16&expand=5090) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(2)] +pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + transmute(vpshrdvw128( + a.as_i16x8(), + b.as_i16x8(), + _mm_set1_epi16(imm8 as i16).as_i16x8(), + )) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi16&expand=5088) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_shrdi_epi16( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + let shf: i16x8 = vpshrdvw128( + a.as_i16x8(), + b.as_i16x8(), + _mm_set1_epi16(imm8 as i16).as_i16x8(), + ); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +} + +/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi16&expand=5089) +#[inline] +#[target_feature(enable = "avx512vbmi2,avx512vl")] +#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let shf: i16x8 = vpshrdvw128( + a.as_i16x8(), + b.as_i16x8(), + _mm_set1_epi16(imm8 as i16).as_i16x8(), + ); + let zero = _mm_setzero_si128().as_i16x8(); + transmute(simd_select_bitmask(k, shf, zero)) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.mask.compress.w.512"] @@ -2694,311 +3195,283 @@ mod tests { let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } - /* - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdv_epi64() { - let a = _mm512_set1_epi64(8); - let b = _mm512_set1_epi64(2); - let c = _mm512_set1_epi64(1); - let r = _mm512_shrdv_epi64(a, b, c); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdv_epi64() { - let a = _mm512_set1_epi64(8); - let b = _mm512_set1_epi64(2); - let c = _mm512_set1_epi64(1); - let r = _mm512_mask_shrdv_epi64(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdv_epi64() { - let a = _mm512_set1_epi64(8); - let b = _mm512_set1_epi64(2); - let c = _mm512_set1_epi64(1); - let r = _mm512_maskz_shrdv_epi64(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c); - let e = _mm512_set1_epi64(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdv_epi64() { - let a = _mm256_set1_epi64x(8); - let b = _mm256_set1_epi64x(2); - let c = _mm256_set1_epi64x(1); - let r = _mm256_shrdv_epi64(a, b, c); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdv_epi64() { - let a = _mm256_set1_epi64x(8); - let b = _mm256_set1_epi64x(2); - let c = _mm256_set1_epi64x(1); - let r = _mm256_mask_shrdv_epi64(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdv_epi64() { - let a = _mm256_set1_epi64x(8); - let b = _mm256_set1_epi64x(2); - let c = _mm256_set1_epi64x(1); - let r = _mm256_maskz_shrdv_epi64(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c); - let e = _mm256_set1_epi64x(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdv_epi64() { - let a = _mm_set1_epi64x(8); - let b = _mm_set1_epi64x(2); - let c = _mm_set1_epi64x(1); - let r = _mm_shrdv_epi64(a, b, c); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdv_epi64() { - let a = _mm_set1_epi64x(8); - let b = _mm_set1_epi64x(2); - let c = _mm_set1_epi64x(1); - let r = _mm_mask_shrdv_epi64(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdv_epi64() { - let a = _mm_set1_epi64x(8); - let b = _mm_set1_epi64x(2); - let c = _mm_set1_epi64x(1); - let r = _mm_maskz_shrdv_epi64(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c); - let e = _mm_set1_epi64x(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdv_epi32() { - let a = _mm512_set1_epi32(8); - let b = _mm512_set1_epi32(2); - let c = _mm512_set1_epi32(1); - let r = _mm512_shrdv_epi32(a, b, c); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdv_epi32() { - let a = _mm512_set1_epi32(8); - let b = _mm512_set1_epi32(2); - let c = _mm512_set1_epi32(1); - let r = _mm512_mask_shrdv_epi32(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdv_epi32() { - let a = _mm512_set1_epi32(8); - let b = _mm512_set1_epi32(2); - let c = _mm512_set1_epi32(1); - let r = _mm512_maskz_shrdv_epi32(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c); - let e = _mm512_set1_epi32(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdv_epi32() { - let a = _mm256_set1_epi32(8); - let b = _mm256_set1_epi32(2); - let c = _mm256_set1_epi32(1); - let r = _mm256_shrdv_epi32(a, b, c); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdv_epi32() { - let a = _mm256_set1_epi32(8); - let b = _mm256_set1_epi32(2); - let c = _mm256_set1_epi32(1); - let r = _mm256_mask_shrdv_epi32(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdv_epi32() { - let a = _mm256_set1_epi32(8); - let b = _mm256_set1_epi32(2); - let c = _mm256_set1_epi32(1); - let r = _mm256_maskz_shrdv_epi32(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c); - let e = _mm256_set1_epi32(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdv_epi32() { - let a = _mm_set1_epi32(8); - let b = _mm_set1_epi32(2); - let c = _mm_set1_epi32(1); - let r = _mm_shrdv_epi32(a, b, c); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdv_epi32() { - let a = _mm_set1_epi32(8); - let b = _mm_set1_epi32(2); - let c = _mm_set1_epi32(1); - let r = _mm_mask_shrdv_epi32(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdv_epi32() { - let a = _mm_set1_epi32(8); - let b = _mm_set1_epi32(2); - let c = _mm_set1_epi32(1); - let r = _mm_maskz_shrdv_epi32(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c); - let e = _mm_set1_epi32(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_shrdv_epi16() { - let a = _mm512_set1_epi16(8); - let b = _mm512_set1_epi16(2); - let c = _mm512_set1_epi16(1); - let r = _mm512_shrdv_epi16(a, b, c); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_mask_shrdv_epi16() { - let a = _mm512_set1_epi16(8); - let b = _mm512_set1_epi16(2); - let c = _mm512_set1_epi16(1); - let r = _mm512_mask_shrdv_epi16(a, 0, b, c); - assert_eq_m512i(r, a); - let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2")] - unsafe fn test_mm512_maskz_shrdv_epi16() { - let a = _mm512_set1_epi16(8); - let b = _mm512_set1_epi16(2); - let c = _mm512_set1_epi16(1); - let r = _mm512_maskz_shrdv_epi16(0, a, b, c); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c); - let e = _mm512_set1_epi16(1); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_shrdv_epi16() { - let a = _mm256_set1_epi16(8); - let b = _mm256_set1_epi16(2); - let c = _mm256_set1_epi16(1); - let r = _mm256_shrdv_epi16(a, b, c); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_mask_shrdv_epi16() { - let a = _mm256_set1_epi16(8); - let b = _mm256_set1_epi16(2); - let c = _mm256_set1_epi16(1); - let r = _mm256_mask_shrdv_epi16(a, 0, b, c); - assert_eq_m256i(r, a); - let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm256_maskz_shrdv_epi16() { - let a = _mm256_set1_epi16(8); - let b = _mm256_set1_epi16(2); - let c = _mm256_set1_epi16(1); - let r = _mm256_maskz_shrdv_epi16(0, a, b, c); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c); - let e = _mm256_set1_epi16(1); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_shrdv_epi16() { - let a = _mm_set1_epi16(8); - let b = _mm_set1_epi16(2); - let c = _mm_set1_epi16(1); - let r = _mm_shrdv_epi16(a, b, c); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_mask_shrdv_epi16() { - let a = _mm_set1_epi16(8); - let b = _mm_set1_epi16(2); - let c = _mm_set1_epi16(1); - let r = _mm_mask_shrdv_epi16(a, 0, b, c); - assert_eq_m128i(r, a); - let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vbmi2,avx512vl")] - unsafe fn test_mm_maskz_shrdv_epi16() { - let a = _mm_set1_epi16(8); - let b = _mm_set1_epi16(2); - let c = _mm_set1_epi16(1); - let r = _mm_maskz_shrdv_epi16(0, a, b, c); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c); - let e = _mm_set1_epi16(1); - assert_eq_m128i(r, e); - } - */ + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let r = _mm512_shrdi_epi64(a, b, 1); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let r = _mm512_mask_shrdi_epi64(a, 0, a, b, 1); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi64(a, 0b11111111, a, b, 1); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi64() { + let a = _mm512_set1_epi64(8); + let b = _mm512_set1_epi64(2); + let r = _mm512_maskz_shrdi_epi64(0, a, b, 1); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi64(0b11111111, a, b, 1); + let e = _mm512_set1_epi64(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let r = _mm256_shrdi_epi64(a, b, 1); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let r = _mm256_mask_shrdi_epi64(a, 0, a, b, 1); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi64(a, 0b00001111, a, b, 1); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi64() { + let a = _mm256_set1_epi64x(8); + let b = _mm256_set1_epi64x(2); + let r = _mm256_maskz_shrdi_epi64(0, a, b, 1); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi64(0b00001111, a, b, 1); + let e = _mm256_set1_epi64x(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let r = _mm_shrdi_epi64(a, b, 1); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let r = _mm_mask_shrdi_epi64(a, 0, a, b, 1); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi64(a, 0b00000011, a, b, 1); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi64() { + let a = _mm_set1_epi64x(8); + let b = _mm_set1_epi64x(2); + let r = _mm_maskz_shrdi_epi64(0, a, b, 1); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi64(0b00000011, a, b, 1); + let e = _mm_set1_epi64x(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let r = _mm512_shrdi_epi32(a, b, 1); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_shrdi_epi32(a, 0, a, b, 1); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi32(a, 0b11111111_11111111, a, b, 1); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi32() { + let a = _mm512_set1_epi32(8); + let b = _mm512_set1_epi32(2); + let r = _mm512_maskz_shrdi_epi32(0, a, b, 1); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi32(0b11111111_11111111, a, b, 1); + let e = _mm512_set1_epi32(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let r = _mm256_shrdi_epi32(a, b, 1); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let r = _mm256_mask_shrdi_epi32(a, 0, a, b, 1); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi32(a, 0b11111111, a, b, 1); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi32() { + let a = _mm256_set1_epi32(8); + let b = _mm256_set1_epi32(2); + let r = _mm256_maskz_shrdi_epi32(0, a, b, 1); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi32(0b11111111, a, b, 1); + let e = _mm256_set1_epi32(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let r = _mm_shrdi_epi32(a, b, 1); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let r = _mm_mask_shrdi_epi32(a, 0, a, b, 1); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi32(a, 0b00001111, a, b, 1); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi32() { + let a = _mm_set1_epi32(8); + let b = _mm_set1_epi32(2); + let r = _mm_maskz_shrdi_epi32(0, a, b, 1); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi32(0b00001111, a, b, 1); + let e = _mm_set1_epi32(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_shrdi_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let r = _mm512_shrdi_epi16(a, b, 1); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_mask_shrdi_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let r = _mm512_mask_shrdi_epi16(a, 0, a, b, 1); + assert_eq_m512i(r, a); + let r = _mm512_mask_shrdi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 1); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2")] + unsafe fn test_mm512_maskz_shrdi_epi16() { + let a = _mm512_set1_epi16(8); + let b = _mm512_set1_epi16(2); + let r = _mm512_maskz_shrdi_epi16(0, a, b, 1); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_shrdi_epi16(0b11111111_11111111_11111111_11111111, a, b, 1); + let e = _mm512_set1_epi16(1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_shrdi_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let r = _mm256_shrdi_epi16(a, b, 1); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_mask_shrdi_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let r = _mm256_mask_shrdi_epi16(a, 0, a, b, 1); + assert_eq_m256i(r, a); + let r = _mm256_mask_shrdi_epi16(a, 0b11111111_11111111, a, b, 1); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm256_maskz_shrdi_epi16() { + let a = _mm256_set1_epi16(8); + let b = _mm256_set1_epi16(2); + let r = _mm256_maskz_shrdi_epi16(0, a, b, 1); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shrdi_epi16(0b11111111_11111111, a, b, 1); + let e = _mm256_set1_epi16(1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_shrdi_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let r = _mm_shrdi_epi16(a, b, 1); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_mask_shrdi_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let r = _mm_mask_shrdi_epi16(a, 0, a, b, 1); + assert_eq_m128i(r, a); + let r = _mm_mask_shrdi_epi16(a, 0b11111111, a, b, 1); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vbmi2,avx512vl")] + unsafe fn test_mm_maskz_shrdi_epi16() { + let a = _mm_set1_epi16(8); + let b = _mm_set1_epi16(2); + let r = _mm_maskz_shrdi_epi16(0, a, b, 1); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shrdi_epi16(0b11111111, a, b, 1); + let e = _mm_set1_epi16(1); + assert_eq_m128i(r, e); + } } From 0b42b6eb4c6aae70e99a2f074991f6d934aed4cc Mon Sep 17 00:00:00 2001 From: jirong Date: Mon, 11 Jan 2021 14:35:48 +0000 Subject: [PATCH 08/10] test1 --- crates/core_arch/avx512vbmi2.md | 1 - crates/core_arch/src/x86/avx512vnni.rs | 849 ++++++++++++++++++ .../core_arch/src/x86/avx512vp2intersect.rs | 43 + 3 files changed, 892 insertions(+), 1 deletion(-) create mode 100644 crates/core_arch/src/x86/avx512vnni.rs create mode 100644 crates/core_arch/src/x86/avx512vp2intersect.rs diff --git a/crates/core_arch/avx512vbmi2.md b/crates/core_arch/avx512vbmi2.md index 43d497108e..4bb6a0ed0c 100644 --- a/crates/core_arch/avx512vbmi2.md +++ b/crates/core_arch/avx512vbmi2.md @@ -150,5 +150,4 @@ * [x] [`_mm512_mask_shrdv_epi64`] * [x] [`_mm512_maskz_shrdv_epi64`] * [x] [`_mm512_shrdv_epi64`] -

diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs new file mode 100644 index 0000000000..daa3c896a6 --- /dev/null +++ b/crates/core_arch/src/x86/avx512vnni.rs @@ -0,0 +1,849 @@ +use crate::{ + core_arch::{simd::*, simd_llvm::*, x86::*}, + mem::transmute, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssd_epi32&expand=2219) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssd_epi32&expand=2220) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssd_epi32&expand=2221) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_epi32&expand=2216) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssd_epi32&expand=2217) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssd_epi32&expand=2218) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_epi32&expand=2213) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssd_epi32&expand=2214) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssd_epi32&expand=2215) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssd))] +pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssds_epi32&expand=2228) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssds_epi32&expand=2229) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssds_epi32&expand=2230) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_epi32&expand=2225) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssds_epi32&expand=2226) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssds_epi32&expand=2227) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_epi32&expand=2222) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssds_epi32&expand=2223) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssds_epi32&expand=2224) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpwssds))] +pub unsafe fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusd_epi32&expand=2201) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusd_epi32&expand=2202) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusd_epi32&expand=2203) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_epi32&expand=2198) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusd_epi32&expand=2199) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusd_epi32&expand=2200) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_epi32&expand=2195) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusd_epi32&expand=2196) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusd_epi32&expand=2197) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusd))] +pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusds_epi32&expand=2210) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusds_epi32&expand=2211) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusds_epi32&expand=2212) +#[inline] +#[target_feature(enable = "avx512vnni")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_epi32&expand=2207) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusds_epi32&expand=2208) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusds_epi32&expand=2209) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_epi32&expand=2204) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusds_epi32&expand=2205) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusds_epi32&expand=2206) +#[inline] +#[target_feature(enable = "avx512vnni,avx512vl")] +#[cfg_attr(test, assert_instr(vpdpbusds))] +pub unsafe fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, r, zero)) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.vpdpwssd.512"] + fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpwssd.256"] + fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpwssd.128"] + fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpwssds.512"] + fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpwssds.256"] + fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpwssds.128"] + fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpbusd.512"] + fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpbusd.256"] + fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpbusd.128"] + fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx512.vpdpbusds.512"] + fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; + #[link_name = "llvm.x86.avx512.vpdpbusds.256"] + fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx512.vpdpbusds.128"] + fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; +} + +#[cfg(test)] +mod tests { + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<16|1<<0); + let b = _mm512_set1_epi32(1<<16|1<<0); + let r = _mm512_dpwssd_epi32(src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<16|1<<0); + let b = _mm512_set1_epi32(1<<16|1<<0); + let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpwssd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<16|1<<0); + let b = _mm512_set1_epi32(1<<16|1<<0); + let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<16|1<<0); + let b = _mm256_set1_epi32(1<<16|1<<0); + let r = _mm256_dpwssd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<16|1<<0); + let b = _mm256_set1_epi32(1<<16|1<<0); + let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpwssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<16|1<<0); + let b = _mm256_set1_epi32(1<<16|1<<0); + let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<16|1<<0); + let b = _mm_set1_epi32(1<<16|1<<0); + let r = _mm_dpwssd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<16|1<<0); + let b = _mm_set1_epi32(1<<16|1<<0); + let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpwssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<16|1<<0); + let b = _mm_set1_epi32(1<<16|1<<0); + let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<16|1<<0); + let b = _mm512_set1_epi32(1<<16|1<<0); + let r = _mm512_dpwssds_epi32(src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<16|1<<0); + let b = _mm512_set1_epi32(1<<16|1<<0); + let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpwssds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<16|1<<0); + let b = _mm512_set1_epi32(1<<16|1<<0); + let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(3); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<16|1<<0); + let b = _mm256_set1_epi32(1<<16|1<<0); + let r = _mm256_dpwssds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<16|1<<0); + let b = _mm256_set1_epi32(1<<16|1<<0); + let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpwssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<16|1<<0); + let b = _mm256_set1_epi32(1<<16|1<<0); + let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<16|1<<0); + let b = _mm_set1_epi32(1<<16|1<<0); + let r = _mm_dpwssds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<16|1<<0); + let b = _mm_set1_epi32(1<<16|1<<0); + let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpwssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<16|1<<0); + let b = _mm_set1_epi32(1<<16|1<<0); + let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm512_dpbusd_epi32(src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpbusd_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm256_dpbusd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpbusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm_dpbusd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpbusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm512_dpbusds_epi32(src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_mask_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b); + assert_eq_m512i(r, src); + let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni")] + unsafe fn test_mm512_maskz_dpbusds_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b); + let e = _mm512_set1_epi32(5); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm256_dpbusds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_mask_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b); + assert_eq_m256i(r, src); + let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm256_maskz_dpbusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm_dpbusds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_mask_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b); + assert_eq_m128i(r, src); + let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512vnni,avx512vl")] + unsafe fn test_mm_maskz_dpbusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); + let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } +} diff --git a/crates/core_arch/src/x86/avx512vp2intersect.rs b/crates/core_arch/src/x86/avx512vp2intersect.rs new file mode 100644 index 0000000000..211f0b25dc --- /dev/null +++ b/crates/core_arch/src/x86/avx512vp2intersect.rs @@ -0,0 +1,43 @@ +use crate::{ + core_arch::{simd::*, /*simd_llvm::*,*/ x86::*}, + mem::transmute, +}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Compute intersection of packed 32-bit integer vectors a and b, and store indication of match in the corresponding bit of two mask registers specified by k1 and k2. A match in corresponding elements of a and b is indicated by a set bit in the corresponding bit of the mask registers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_2intersect_epi32&expand=2) +#[inline] +#[target_feature(enable = "avx512vp2intersect,avx512f")] +#[cfg_attr(test, assert_instr(vp2intersectd))] +pub unsafe fn _mm512_2intersect_epi32(a: __m512i, b: __m512i, k1: *mut u16, k2: *mut u16) { + transmute(vp2intersectd(a.as_i32x16(), b.as_i32x16(), k1, k2)) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.vp2intersect.d.512"] + fn vp2intersectd(a: i32x16, b: i32x16, k1: *mut u16, k2: *mut u16); +} + +#[cfg(test)] +mod tests { + + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx512vp2intersect,avx512f")] + unsafe fn test_mm512_2intersect_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(1); + let mut r1: u16 = 0; + let mut r2: u16 = 0; + _mm512_2intersect_epi32(a, b, &mut r1 as *mut _ as *mut u16, &mut r2 as *mut _ as *mut u16); + //assert_eq!(r1, 0b11111111_11111111); + //assert_eq!(r2, 0b11111111_11111111); + assert_eq!(r1, 0); + assert_eq!(r2, 0); + } +} From 3ee3e64ce16cc1461e290e95682a3403df938457 Mon Sep 17 00:00:00 2001 From: jirong Date: Mon, 11 Jan 2021 14:36:28 +0000 Subject: [PATCH 09/10] test1 --- crates/core_arch/src/x86/avx512vnni.rs | 849 ------------------ .../core_arch/src/x86/avx512vp2intersect.rs | 43 - 2 files changed, 892 deletions(-) delete mode 100644 crates/core_arch/src/x86/avx512vnni.rs delete mode 100644 crates/core_arch/src/x86/avx512vp2intersect.rs diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs deleted file mode 100644 index daa3c896a6..0000000000 --- a/crates/core_arch/src/x86/avx512vnni.rs +++ /dev/null @@ -1,849 +0,0 @@ -use crate::{ - core_arch::{simd::*, simd_llvm::*, x86::*}, - mem::transmute, -}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssd_epi32&expand=2219) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssd_epi32&expand=2220) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssd_epi32&expand=2221) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_epi32&expand=2216) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssd_epi32&expand=2217) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssd_epi32&expand=2218) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); - let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_epi32&expand=2213) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssd_epi32&expand=2214) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssd_epi32&expand=2215) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); - let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssds_epi32&expand=2228) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssds_epi32&expand=2229) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssds_epi32&expand=2230) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_epi32&expand=2225) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssds_epi32&expand=2226) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssds_epi32&expand=2227) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); - let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_epi32&expand=2222) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssds_epi32&expand=2223) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssds_epi32&expand=2224) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); - let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusd_epi32&expand=2201) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusd_epi32&expand=2202) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusd_epi32&expand=2203) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_epi32&expand=2198) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusd_epi32&expand=2199) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusd_epi32&expand=2200) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); - let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_epi32&expand=2195) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusd_epi32&expand=2196) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusd_epi32&expand=2197) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); - let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusds_epi32&expand=2210) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusds_epi32&expand=2211) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusds_epi32&expand=2212) -#[inline] -#[target_feature(enable = "avx512vnni")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { - let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_epi32&expand=2207) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusds_epi32&expand=2208) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusds_epi32&expand=2209) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { - let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); - let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, r, zero)) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_epi32&expand=2204) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusds_epi32&expand=2205) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusds_epi32&expand=2206) -#[inline] -#[target_feature(enable = "avx512vnni,avx512vl")] -#[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); - let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, r, zero)) -} - -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.x86.avx512.vpdpwssd.512"] - fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpwssd.256"] - fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpwssd.128"] - fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpdpwssds.512"] - fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpwssds.256"] - fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpwssds.128"] - fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpdpbusd.512"] - fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpbusd.256"] - fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpbusd.128"] - fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; - - #[link_name = "llvm.x86.avx512.vpdpbusds.512"] - fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16; - #[link_name = "llvm.x86.avx512.vpdpbusds.256"] - fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx512.vpdpbusds.128"] - fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; -} - -#[cfg(test)] -mod tests { - - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpwssd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<16|1<<0); - let b = _mm512_set1_epi32(1<<16|1<<0); - let r = _mm512_dpwssd_epi32(src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpwssd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<16|1<<0); - let b = _mm512_set1_epi32(1<<16|1<<0); - let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpwssd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<16|1<<0); - let b = _mm512_set1_epi32(1<<16|1<<0); - let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpwssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<16|1<<0); - let b = _mm256_set1_epi32(1<<16|1<<0); - let r = _mm256_dpwssd_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpwssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<16|1<<0); - let b = _mm256_set1_epi32(1<<16|1<<0); - let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpwssd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<16|1<<0); - let b = _mm256_set1_epi32(1<<16|1<<0); - let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpwssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<16|1<<0); - let b = _mm_set1_epi32(1<<16|1<<0); - let r = _mm_dpwssd_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpwssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<16|1<<0); - let b = _mm_set1_epi32(1<<16|1<<0); - let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpwssd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<16|1<<0); - let b = _mm_set1_epi32(1<<16|1<<0); - let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpwssds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<16|1<<0); - let b = _mm512_set1_epi32(1<<16|1<<0); - let r = _mm512_dpwssds_epi32(src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpwssds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<16|1<<0); - let b = _mm512_set1_epi32(1<<16|1<<0); - let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpwssds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<16|1<<0); - let b = _mm512_set1_epi32(1<<16|1<<0); - let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(3); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpwssds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<16|1<<0); - let b = _mm256_set1_epi32(1<<16|1<<0); - let r = _mm256_dpwssds_epi32(src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpwssds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<16|1<<0); - let b = _mm256_set1_epi32(1<<16|1<<0); - let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpwssds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<16|1<<0); - let b = _mm256_set1_epi32(1<<16|1<<0); - let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(3); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpwssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<16|1<<0); - let b = _mm_set1_epi32(1<<16|1<<0); - let r = _mm_dpwssds_epi32(src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpwssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<16|1<<0); - let b = _mm_set1_epi32(1<<16|1<<0); - let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpwssds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<16|1<<0); - let b = _mm_set1_epi32(1<<16|1<<0); - let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(3); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpbusd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm512_dpbusd_epi32(src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpbusd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpbusd_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpbusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm256_dpbusd_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpbusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpbusd_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpbusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm_dpbusd_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpbusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpbusd_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_dpbusds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm512_dpbusds_epi32(src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_mask_dpbusds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b); - assert_eq_m512i(r, src); - let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni")] - unsafe fn test_mm512_maskz_dpbusds_epi32() { - let src = _mm512_set1_epi32(1); - let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b); - assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b); - let e = _mm512_set1_epi32(5); - assert_eq_m512i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_dpbusds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm256_dpbusds_epi32(src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_mask_dpbusds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b); - assert_eq_m256i(r, src); - let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm256_maskz_dpbusds_epi32() { - let src = _mm256_set1_epi32(1); - let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b); - assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b); - let e = _mm256_set1_epi32(5); - assert_eq_m256i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_dpbusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm_dpbusds_epi32(src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_mask_dpbusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b); - assert_eq_m128i(r, src); - let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } - - #[simd_test(enable = "avx512vnni,avx512vl")] - unsafe fn test_mm_maskz_dpbusds_epi32() { - let src = _mm_set1_epi32(1); - let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0); - let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b); - assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b); - let e = _mm_set1_epi32(5); - assert_eq_m128i(r, e); - } -} diff --git a/crates/core_arch/src/x86/avx512vp2intersect.rs b/crates/core_arch/src/x86/avx512vp2intersect.rs deleted file mode 100644 index 211f0b25dc..0000000000 --- a/crates/core_arch/src/x86/avx512vp2intersect.rs +++ /dev/null @@ -1,43 +0,0 @@ -use crate::{ - core_arch::{simd::*, /*simd_llvm::*,*/ x86::*}, - mem::transmute, -}; - -#[cfg(test)] -use stdarch_test::assert_instr; - -/// Compute intersection of packed 32-bit integer vectors a and b, and store indication of match in the corresponding bit of two mask registers specified by k1 and k2. A match in corresponding elements of a and b is indicated by a set bit in the corresponding bit of the mask registers. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_2intersect_epi32&expand=2) -#[inline] -#[target_feature(enable = "avx512vp2intersect,avx512f")] -#[cfg_attr(test, assert_instr(vp2intersectd))] -pub unsafe fn _mm512_2intersect_epi32(a: __m512i, b: __m512i, k1: *mut u16, k2: *mut u16) { - transmute(vp2intersectd(a.as_i32x16(), b.as_i32x16(), k1, k2)) -} - -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.x86.avx512.vp2intersect.d.512"] - fn vp2intersectd(a: i32x16, b: i32x16, k1: *mut u16, k2: *mut u16); -} - -#[cfg(test)] -mod tests { - - use crate::core_arch::x86::*; - use stdarch_test::simd_test; - - #[simd_test(enable = "avx512vp2intersect,avx512f")] - unsafe fn test_mm512_2intersect_epi32() { - let a = _mm512_set1_epi32(1); - let b = _mm512_set1_epi32(1); - let mut r1: u16 = 0; - let mut r2: u16 = 0; - _mm512_2intersect_epi32(a, b, &mut r1 as *mut _ as *mut u16, &mut r2 as *mut _ as *mut u16); - //assert_eq!(r1, 0b11111111_11111111); - //assert_eq!(r2, 0b11111111_11111111); - assert_eq!(r1, 0); - assert_eq!(r2, 0); - } -} From fd3a1aea9af28bb51156855b0303ee12f9a11c53 Mon Sep 17 00:00:00 2001 From: jirong Date: Mon, 11 Jan 2021 15:54:51 +0000 Subject: [PATCH 10/10] remove wasm32 ci --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fdae25c903..ee75f8f022 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: - mips64-unknown-linux-gnuabi64 - mips64el-unknown-linux-gnuabi64 - s390x-unknown-linux-gnu - - wasm32-wasi + #- wasm32-wasi - i586-unknown-linux-gnu - x86_64-linux-android - arm-linux-androideabi @@ -130,8 +130,8 @@ jobs: disable_assert_instr: true - target: s390x-unknown-linux-gnu os: ubuntu-latest - - target: wasm32-wasi - os: ubuntu-latest + #- target: wasm32-wasi + # os: ubuntu-latest - target: aarch64-unknown-linux-gnu os: ubuntu-latest - target: x86_64-apple-darwin