From 6f697f71fcc53622e866707a0191e6d0baf44c9e Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Tue, 5 Jan 2021 00:57:19 +0000
Subject: [PATCH 01/10] compress_epi16

---
 crates/core_arch/src/x86/avx512vbmi.rs   |  2 -
 crates/core_arch/src/x86/avx512vbmi2.rs  | 74 ++++++++++++++++++++++++
 crates/core_arch/src/x86/mod.rs          |  3 +
 crates/stdarch-verify/tests/x86-intel.rs |  3 +
 4 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 crates/core_arch/src/x86/avx512vbmi2.rs

diff --git a/crates/core_arch/src/x86/avx512vbmi.rs b/crates/core_arch/src/x86/avx512vbmi.rs
index 21437e3da2..f0ff75162f 100644
--- a/crates/core_arch/src/x86/avx512vbmi.rs
+++ b/crates/core_arch/src/x86/avx512vbmi.rs
@@ -438,8 +438,6 @@ mod tests {
     use stdarch_test::simd_test;
 
     use crate::core_arch::x86::*;
-    //use crate::hint::black_box;
-    //use crate::mem::{self};
 
     #[simd_test(enable = "avx512vbmi")]
     unsafe fn test_mm512_permutex2var_epi8() {
diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
new file mode 100644
index 0000000000..df9c03fe26
--- /dev/null
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -0,0 +1,74 @@
+use crate::core_arch::{simd::*, /*simd_llvm::*,*/ x86::*};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi16&expand=1192)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi16&expand=1193)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i {
+    transmute(vpcompressw(
+        a.as_i16x32(),
+        _mm512_setzero_si512().as_i16x32(),
+        k,
+    ))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
+    fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+
+    #[link_name = "llvm.x86.avx512.mask.compress.b.512"]
+    fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+}
+
+#[cfg(test)]
+mod tests {
+
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::x86::*;
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_compress_epi16() {
+        let src = _mm512_set1_epi16(200);
+        #[rustfmt::skip]
+        let a = _mm512_set_epi16(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm512_set_epi16(
+            200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
+            1,   3,   5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,  27,  29,  31,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_compress_epi16() {
+        #[rustfmt::skip]
+        let a = _mm512_set_epi16(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm512_set_epi16(
+            0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+        );
+        assert_eq_m512i(r, e);
+    }
+}
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index b853451b49..68f1bd8991 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -678,6 +678,9 @@ pub use self::avx512ifma::*;
 mod avx512vbmi;
 pub use self::avx512vbmi::*;
 
+mod avx512vbmi2;
+pub use self::avx512vbmi2::*;
+
 mod avx512bitalg;
 pub use self::avx512bitalg::*;
 
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index 601549d5dc..51b3d054a4 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -479,6 +479,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
             // The XML file names VBMI as "avx512_vbmi", while Rust calls
             // it "avx512vbmi".
             "avx512_vbmi" => String::from("avx512vbmi"),
+            // The XML file names VBMI2 as "avx512_vbmi2", while Rust calls
+            // it "avx512vbmi2".
+            "avx512_vbmi2" => String::from("avx512vbmi2"),
             // Some AVX512f intrinsics are also supported by Knight's Corner.
             // The XML lists them as avx512f/kncni, but we are solely gating
             // them behind avx512f since we don't have a KNC feature yet.

From 903761fe744255f2c240d275f58cd38c634c4de6 Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Fri, 8 Jan 2021 01:20:34 +0000
Subject: [PATCH 02/10] compress_epi8

---
 crates/core_arch/src/x86/avx512vbmi2.rs | 256 ++++++++++++++++++++++++
 1 file changed, 256 insertions(+)

diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index df9c03fe26..9e651ab59c 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -27,13 +27,141 @@ pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i {
     ))
 }
 
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi16&expand=1190)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi16&expand=1191)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i {
+    transmute(vpcompressw256(
+        a.as_i16x16(),
+        _mm256_setzero_si256().as_i16x16(),
+        k,
+    ))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi16&expand=1188)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k))
+}
+
+/// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi16&expand=1189)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressw))]
+pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i {
+    transmute(vpcompressw128(
+        a.as_i16x8(),
+        _mm_setzero_si128().as_i16x8(),
+        k,
+    ))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi8&expand=1210)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+    transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi8&expand=1211)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i {
+    transmute(vpcompressb(
+        a.as_i8x64(),
+        _mm512_setzero_si512().as_i8x64(),
+        k,
+    ))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi8&expand=1208)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+    transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi8&expand=1209)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i {
+    transmute(vpcompressb256(
+        a.as_i8x32(),
+        _mm256_setzero_si256().as_i8x32(),
+        k,
+    ))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi8&expand=1206)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+    transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k))
+}
+
+/// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi8&expand=1207)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpcompressb))]
+pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i {
+    transmute(vpcompressb128(
+        a.as_i8x16(),
+        _mm_setzero_si128().as_i8x16(),
+        k,
+    ))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
     fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+    #[link_name = "llvm.x86.avx512.mask.compress.w.256"]
+    fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+    #[link_name = "llvm.x86.avx512.mask.compress.w.128"]
+    fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
 
     #[link_name = "llvm.x86.avx512.mask.compress.b.512"]
     fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+    #[link_name = "llvm.x86.avx512.mask.compress.b.256"]
+    fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+    #[link_name = "llvm.x86.avx512.mask.compress.b.128"]
+    fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
 }
 
 #[cfg(test)]
@@ -71,4 +199,132 @@ mod tests {
         );
         assert_eq_m512i(r, e);
     }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_compress_epi16() {
+        let src = _mm256_set1_epi16(200);
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a);
+        let e = _mm256_set_epi16(
+            200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_compress_epi16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a);
+        let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_compress_epi16() {
+        let src = _mm_set1_epi16(200);
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm_mask_compress_epi16(src, 0b01010101, a);
+        let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_compress_epi16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm_maskz_compress_epi16(0b01010101, a);
+        let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_compress_epi8() {
+        let src = _mm512_set1_epi8(100);
+        #[rustfmt::skip]
+        let a = _mm512_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+                                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+        let r = _mm512_mask_compress_epi8(
+            src,
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+            a,
+        );
+        #[rustfmt::skip]
+        let e = _mm512_set_epi8(
+            100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+            100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+            1,   3,   5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,  27,  29,  31,
+            33,  35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,  59,  61,  63,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_compress_epi8() {
+        #[rustfmt::skip]
+        let a = _mm512_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+                                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+        let r = _mm512_maskz_compress_epi8(
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+            a,
+        );
+        #[rustfmt::skip]
+        let e = _mm512_set_epi8(
+            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+            1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+            33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_compress_epi8() {
+        let src = _mm256_set1_epi8(100);
+        #[rustfmt::skip]
+        let a = _mm256_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm256_set_epi8(
+            100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
+            1,   3,   5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,  27,  29,  31,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_compress_epi8() {
+        #[rustfmt::skip]
+        let a = _mm256_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm256_set_epi8(
+            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+            1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_compress_epi8() {
+        let src = _mm_set1_epi8(100);
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a);
+        let e = _mm_set_epi8(
+            100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15,
+        );
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_compress_epi8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm_maskz_compress_epi8(0b01010101_01010101, a);
+        let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
+        assert_eq_m128i(r, e);
+    }
 }

From 33ba1beb23cac61e3368b0068f5c17f1819d68fe Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Fri, 8 Jan 2021 18:32:07 +0000
Subject: [PATCH 03/10] expand_epi16, expand_epi8

---
 crates/core_arch/src/x86/avx512vbmi2.rs | 313 ++++++++++++++++++++++++
 1 file changed, 313 insertions(+)

diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index 9e651ab59c..d00da42c16 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -147,6 +147,150 @@ pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i {
     ))
 }
 
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi16&expand=2310)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
+    transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi16&expand=2311)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i {
+    transmute(vpexpandw(
+        a.as_i16x32(),
+        _mm512_setzero_si512().as_i16x32(),
+        k,
+    ))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi16&expand=2308)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
+    transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi16&expand=2309)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i {
+    transmute(vpexpandw256(
+        a.as_i16x16(),
+        _mm256_setzero_si256().as_i16x16(),
+        k,
+    ))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi16&expand=2306)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
+    transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k))
+}
+
+/// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi16&expand=2307)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandw))]
+pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i {
+    transmute(vpexpandw128(
+        a.as_i16x8(),
+        _mm_setzero_si128().as_i16x8(),
+        k,
+    ))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi8&expand=2328)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
+    transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi8&expand=2329)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i {
+    transmute(vpexpandb(
+        a.as_i8x64(),
+        _mm512_setzero_si512().as_i8x64(),
+        k,
+    ))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi8&expand=2326)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
+    transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi8&expand=2327)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i {
+    transmute(vpexpandb256(
+        a.as_i8x32(),
+        _mm256_setzero_si256().as_i8x32(),
+        k,
+    ))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi8&expand=2324)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
+    transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k))
+}
+
+/// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi8&expand=2325)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpexpandb))]
+pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
+    transmute(vpexpandb128(
+        a.as_i8x16(),
+        _mm_setzero_si128().as_i8x16(),
+        k,
+    ))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
@@ -162,6 +306,20 @@ extern "C" {
     fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
     #[link_name = "llvm.x86.avx512.mask.compress.b.128"]
     fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+    #[link_name = "llvm.x86.avx512.mask.expand.w.512"]
+    fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
+    #[link_name = "llvm.x86.avx512.mask.expand.w.256"]
+    fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
+    #[link_name = "llvm.x86.avx512.mask.expand.w.128"]
+    fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
+
+    #[link_name = "llvm.x86.avx512.mask.expand.b.512"]
+    fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
+    #[link_name = "llvm.x86.avx512.mask.expand.b.256"]
+    fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
+    #[link_name = "llvm.x86.avx512.mask.expand.b.128"]
+    fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
 }
 
 #[cfg(test)]
@@ -327,4 +485,159 @@ mod tests {
         let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
         assert_eq_m128i(r, e);
     }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_expand_epi16() {
+        let src = _mm512_set1_epi16(200);
+        #[rustfmt::skip]
+        let a = _mm512_set_epi16(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm512_set_epi16(
+            200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23,
+            200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_expand_epi16() {
+        #[rustfmt::skip]
+        let a = _mm512_set_epi16(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23,
+                                 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_expand_epi16() {
+        let src = _mm256_set1_epi16(200);
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a);
+        let e = _mm256_set_epi16(
+            200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_expand_epi16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a);
+        let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_expand_epi16() {
+        let src = _mm_set1_epi16(200);
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm_mask_expand_epi16(src, 0b01010101, a);
+        let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_expand_epi16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm_maskz_expand_epi16(0b01010101, a);
+        let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_expand_epi8() {
+        let src = _mm512_set1_epi8(100);
+        #[rustfmt::skip]
+        let a = _mm512_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+                                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+        let r = _mm512_mask_expand_epi8(
+            src,
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+            a,
+        );
+        #[rustfmt::skip]
+        let e = _mm512_set_epi8(
+            100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39,
+            100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47,
+            100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55,
+            100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_expand_epi8() {
+        #[rustfmt::skip]
+        let a = _mm512_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+                                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
+        let r = _mm512_maskz_expand_epi8(
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
+            a,
+        );
+        #[rustfmt::skip]
+        let e = _mm512_set_epi8(
+            0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39,
+            0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47,
+            0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55,
+            0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63,
+        );
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_expand_epi8() {
+        let src = _mm256_set1_epi8(100);
+        #[rustfmt::skip]
+        let a = _mm256_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm256_set_epi8(
+            100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23,
+            100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_expand_epi8() {
+        #[rustfmt::skip]
+        let a = _mm256_set_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
+        let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a);
+        #[rustfmt::skip]
+        let e = _mm256_set_epi8(
+            0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23,
+            0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31,
+        );
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_expand_epi8() {
+        let src = _mm_set1_epi8(100);
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a);
+        let e = _mm_set_epi8(
+            100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15,
+        );
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_expand_epi8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r = _mm_maskz_expand_epi8(0b01010101_01010101, a);
+        let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
+        assert_eq_m128i(r, e);
+    }
 }

From 5b6f90317a71607f061ab2ba357d488438fc510d Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Sat, 9 Jan 2021 01:43:06 +0000
Subject: [PATCH 04/10] shldv_epi64,epi32,epi16: mm512,mm256,mm

---
 crates/core_arch/src/x86/avx512vbmi2.rs | 1030 ++++++++++++++++++++++-
 1 file changed, 1029 insertions(+), 1 deletion(-)

diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index d00da42c16..b8bd18ef6e 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -1,4 +1,4 @@
-use crate::core_arch::{simd::*, /*simd_llvm::*,*/ x86::*};
+use crate::core_arch::{simd::*, simd_llvm::*, x86::*};
 
 #[cfg(test)]
 use stdarch_test::assert_instr;
@@ -291,6 +291,690 @@ pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
     ))
 }
 
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi64&expand=5087)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi64&expand=5085)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm512_mask_shldv_epi64(
+    a: __m512i,
+    k: __mmask8,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
+    transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi64&expand=5086)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi64&expand=5084)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi64&expand=5082)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm256_mask_shldv_epi64(
+    a: __m256i,
+    k: __mmask8,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
+    transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi64&expand=5083)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
+    let zero = _mm256_setzero_si256().as_i64x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi64&expand=5081)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi64&expand=5079)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm_mask_shldv_epi64(
+    a: __m128i,
+    k: __mmask8,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
+    transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi64&expand=5080)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvq))]
+pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
+    let zero = _mm_setzero_si128().as_i64x2();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi32&expand=5078)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi32&expand=5076)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm512_mask_shldv_epi32(
+    a: __m512i,
+    k: __mmask16,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
+    transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi32&expand=5077)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi32&expand=5075)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi32&expand=5073)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm256_mask_shldv_epi32(
+    a: __m256i,
+    k: __mmask8,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
+    transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi32&expand=5074)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi32&expand=5072)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi32&expand=5070)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm_mask_shldv_epi32(
+    a: __m128i,
+    k: __mmask8,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
+    transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi32&expand=5071)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvd))]
+pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi16&expand=5069)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi16&expand=5067)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm512_mask_shldv_epi16(
+    a: __m512i,
+    k: __mmask32,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
+    transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi16&expand=5068)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
+    let zero = _mm512_setzero_si512().as_i16x32();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi16&expand=5066)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi16&expand=5064)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm256_mask_shldv_epi16(
+    a: __m256i,
+    k: __mmask16,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
+    transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi16&expand=5065)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
+    let zero = _mm256_setzero_si256().as_i16x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi16&expand=5063)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi16&expand=5061)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm_mask_shldv_epi16(
+    a: __m128i,
+    k: __mmask8,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
+    transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi16&expand=5062)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldvw))]
+pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
+    let zero = _mm_setzero_si128().as_i16x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi64&expand=5141)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    transmute(vpshrdvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi64&expand=5139)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm512_mask_shrdv_epi64(
+    a: __m512i,
+    k: __mmask8,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
+    transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi64&expand=5140)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi64&expand=5138)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    transmute(vpshrdvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi64&expand=5136)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm256_mask_shrdv_epi64(
+    a: __m256i,
+    k: __mmask8,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
+    transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi64&expand=5137)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
+    let zero = _mm256_setzero_si256().as_i64x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi64&expand=5135)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    transmute(vpshrdvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi64&expand=5133)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm_mask_shrdv_epi64(
+    a: __m128i,
+    k: __mmask8,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
+    transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi64&expand=5134)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvq))]
+pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
+    let zero = _mm_setzero_si128().as_i64x2();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi32&expand=5132)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    transmute(vpshrdvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi32&expand=5130)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm512_mask_shrdv_epi32(
+    a: __m512i,
+    k: __mmask16,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
+    transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi32&expand=5131)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi32&expand=5129)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    transmute(vpshrdvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi32&expand=5127)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm256_mask_shrdv_epi32(
+    a: __m256i,
+    k: __mmask8,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
+    transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi32&expand=5128)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi32&expand=5126)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    transmute(vpshrdvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi32&expand=5124)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm_mask_shrdv_epi32(
+    a: __m128i,
+    k: __mmask8,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
+    transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi32&expand=5125)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvd))]
+pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi16&expand=5123)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    transmute(vpshrdvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi16&expand=5121)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm512_mask_shrdv_epi16(
+    a: __m512i,
+    k: __mmask32,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
+    let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
+    transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi16&expand=5122)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+    let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
+    let zero = _mm512_setzero_si512().as_i16x32();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi16&expand=5120)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    transmute(vpshrdvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi16&expand=5118)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm256_mask_shrdv_epi16(
+    a: __m256i,
+    k: __mmask16,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
+    let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
+    transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi16&expand=5119)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
+    let zero = _mm256_setzero_si256().as_i16x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi16&expand=5117)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    transmute(vpshrdvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi16&expand=5115)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm_mask_shrdv_epi16(
+    a: __m128i,
+    k: __mmask8,
+    b: __m128i,
+    c: __m128i,
+) -> __m128i {
+    let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+    transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi16&expand=5116)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshrdvw))]
+pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
+    let zero = _mm_setzero_si128().as_i16x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
@@ -320,6 +1004,44 @@ extern "C" {
     fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
     #[link_name = "llvm.x86.avx512.mask.expand.b.128"]
     fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
+
+    #[link_name = "llvm.fshl.v8i64"]
+    fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
+    #[link_name = "llvm.fshl.v4i64"]
+    fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
+    #[link_name = "llvm.fshl.v2i64"]
+    fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
+    #[link_name = "llvm.fshl.v16i32"]
+    fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
+    #[link_name = "llvm.fshl.v8i32"]
+    fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
+    #[link_name = "llvm.fshl.v4i32"]
+    fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.fshl.v32i16"]
+    fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
+    #[link_name = "llvm.fshl.v16i16"]
+    fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
+    #[link_name = "llvm.fshl.v8i16"]
+    fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
+
+    #[link_name = "llvm.fshr.v8i64"]
+    fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
+    #[link_name = "llvm.fshr.v4i64"]
+    fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
+    #[link_name = "llvm.fshr.v2i64"]
+    fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
+    #[link_name = "llvm.fshr.v16i32"]
+    fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
+    #[link_name = "llvm.fshr.v8i32"]
+    fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
+    #[link_name = "llvm.fshr.v4i32"]
+    fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.fshr.v32i16"]
+    fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
+    #[link_name = "llvm.fshr.v16i16"]
+    fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
+    #[link_name = "llvm.fshr.v8i16"]
+    fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
 }
 
 #[cfg(test)]
@@ -640,4 +1362,310 @@ mod tests {
         let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
         assert_eq_m128i(r, e);
     }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shldv_epi64() {
+        let a = _mm512_set1_epi64(1);
+        let b = _mm512_set1_epi64(1<<63);
+        let c = _mm512_set1_epi64(2);
+        let r = _mm512_shldv_epi64(a, b, c);
+        let e = _mm512_set1_epi64(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shldv_epi64() {
+        let a = _mm512_set1_epi64(1);
+        let b = _mm512_set1_epi64(1<<63);
+        let c = _mm512_set1_epi64(2);
+        let r = _mm512_mask_shldv_epi64(a, 0, b, c);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c);
+        let e = _mm512_set1_epi64(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shldv_epi64() {
+        let a = _mm512_set1_epi64(1);
+        let b = _mm512_set1_epi64(1<<63);
+        let c = _mm512_set1_epi64(2);
+        let r = _mm512_maskz_shldv_epi64(0, a, b, c);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c);
+        let e = _mm512_set1_epi64(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shldv_epi64() {
+        let a = _mm256_set1_epi64x(1);
+        let b = _mm256_set1_epi64x(1<<63);
+        let c = _mm256_set1_epi64x(2);
+        let r = _mm256_shldv_epi64(a, b, c);
+        let e = _mm256_set1_epi64x(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shldv_epi64() {
+        let a = _mm256_set1_epi64x(1);
+        let b = _mm256_set1_epi64x(1<<63);
+        let c = _mm256_set1_epi64x(2);
+        let r = _mm256_mask_shldv_epi64(a, 0, b, c);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c);
+        let e = _mm256_set1_epi64x(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shldv_epi64() {
+        let a = _mm256_set1_epi64x(1);
+        let b = _mm256_set1_epi64x(1<<63);
+        let c = _mm256_set1_epi64x(2);
+        let r = _mm256_maskz_shldv_epi64(0, a, b, c);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c);
+        let e = _mm256_set1_epi64x(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shldv_epi64() {
+        let a = _mm_set1_epi64x(1);
+        let b = _mm_set1_epi64x(1<<63);
+        let c = _mm_set1_epi64x(2);
+        let r = _mm_shldv_epi64(a, b, c);
+        let e = _mm_set1_epi64x(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shldv_epi64() {
+        let a = _mm_set1_epi64x(1);
+        let b = _mm_set1_epi64x(1<<63);
+        let c = _mm_set1_epi64x(2);
+        let r = _mm_mask_shldv_epi64(a, 0, b, c);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c);
+        let e = _mm_set1_epi64x(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shldv_epi64() {
+        let a = _mm_set1_epi64x(1);
+        let b = _mm_set1_epi64x(1<<63);
+        let c = _mm_set1_epi64x(2);
+        let r = _mm_maskz_shldv_epi64(0, a, b, c);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c);
+        let e = _mm_set1_epi64x(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shldv_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1<<31);
+        let c = _mm512_set1_epi32(2);
+        let r = _mm512_shldv_epi32(a, b, c);
+        let e = _mm512_set1_epi32(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shldv_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1<<31);
+        let c = _mm512_set1_epi32(2);
+        let r = _mm512_mask_shldv_epi32(a, 0, b, c);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c);
+        let e = _mm512_set1_epi32(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shldv_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1<<31);
+        let c = _mm512_set1_epi32(2);
+        let r = _mm512_maskz_shldv_epi32(0, a, b, c);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c);
+        let e = _mm512_set1_epi32(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shldv_epi32() {
+        let a = _mm256_set1_epi32(1);
+        let b = _mm256_set1_epi32(1<<31);
+        let c = _mm256_set1_epi32(2);
+        let r = _mm256_shldv_epi32(a, b, c);
+        let e = _mm256_set1_epi32(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shldv_epi32() {
+        let a = _mm256_set1_epi32(1);
+        let b = _mm256_set1_epi32(1<<31);
+        let c = _mm256_set1_epi32(2);
+        let r = _mm256_mask_shldv_epi32(a, 0, b, c);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c);
+        let e = _mm256_set1_epi32(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shldv_epi32() {
+        let a = _mm256_set1_epi32(1);
+        let b = _mm256_set1_epi32(1<<31);
+        let c = _mm256_set1_epi32(2);
+        let r = _mm256_maskz_shldv_epi32(0, a, b, c);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c);
+        let e = _mm256_set1_epi32(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shldv_epi32() {
+        let a = _mm_set1_epi32(1);
+        let b = _mm_set1_epi32(1<<31);
+        let c = _mm_set1_epi32(2);
+        let r = _mm_shldv_epi32(a, b, c);
+        let e = _mm_set1_epi32(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shldv_epi32() {
+        let a = _mm_set1_epi32(1);
+        let b = _mm_set1_epi32(1<<31);
+        let c = _mm_set1_epi32(2);
+        let r = _mm_mask_shldv_epi32(a, 0, b, c);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c);
+        let e = _mm_set1_epi32(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shldv_epi32() {
+        let a = _mm_set1_epi32(1);
+        let b = _mm_set1_epi32(1<<31);
+        let c = _mm_set1_epi32(2);
+        let r = _mm_maskz_shldv_epi32(0, a, b, c);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c);
+        let e = _mm_set1_epi32(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shldv_epi16() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1<<15);
+        let c = _mm512_set1_epi16(2);
+        let r = _mm512_shldv_epi16(a, b, c);
+        let e = _mm512_set1_epi16(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shldv_epi16() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1<<15);
+        let c = _mm512_set1_epi16(2);
+        let r = _mm512_mask_shldv_epi16(a, 0, b, c);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
+        let e = _mm512_set1_epi16(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shldv_epi16() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1<<15);
+        let c = _mm512_set1_epi16(2);
+        let r = _mm512_maskz_shldv_epi16(0, a, b, c);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
+        let e = _mm512_set1_epi16(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shldv_epi16() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1<<15);
+        let c = _mm256_set1_epi16(2);
+        let r = _mm256_shldv_epi16(a, b, c);
+        let e = _mm256_set1_epi16(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shldv_epi16() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1<<15);
+        let c = _mm256_set1_epi16(2);
+        let r = _mm256_mask_shldv_epi16(a, 0, b, c);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c);
+        let e = _mm256_set1_epi16(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shldv_epi16() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1<<15);
+        let c = _mm256_set1_epi16(2);
+        let r = _mm256_maskz_shldv_epi16(0, a, b, c);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c);
+        let e = _mm256_set1_epi16(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shldv_epi16() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1<<15);
+        let c = _mm_set1_epi16(2);
+        let r = _mm_shldv_epi16(a, b, c);
+        let e = _mm_set1_epi16(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shldv_epi16() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1<<15);
+        let c = _mm_set1_epi16(2);
+        let r = _mm_mask_shldv_epi16(a, 0, b, c);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c);
+        let e = _mm_set1_epi16(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shldv_epi16() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1<<15);
+        let c = _mm_set1_epi16(2);
+        let r = _mm_maskz_shldv_epi16(0, a, b, c);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c);
+        let e = _mm_set1_epi16(6);
+        assert_eq_m128i(r, e);
+    }
 }

From adc972a4dba917ae95938d6d26e51aa2836c142d Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Sat, 9 Jan 2021 14:22:07 +0000
Subject: [PATCH 05/10] shrdv_epi64,epi32,epi16: mm512,mm256,mm

---
 crates/core_arch/src/x86/avx512vbmi2.rs | 528 +++++++++++++++++-------
 1 file changed, 387 insertions(+), 141 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index b8bd18ef6e..94ce39496b 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -307,12 +307,7 @@ pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshldvq))]
-pub unsafe fn _mm512_mask_shldv_epi64(
-    a: __m512i,
-    k: __mmask8,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
+pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
     let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
     transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
 }
@@ -345,12 +340,7 @@ pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvq))]
-pub unsafe fn _mm256_mask_shldv_epi64(
-    a: __m256i,
-    k: __mmask8,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
+pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
     let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
     transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
 }
@@ -383,12 +373,7 @@ pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvq))]
-pub unsafe fn _mm_mask_shldv_epi64(
-    a: __m128i,
-    k: __mmask8,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
     let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
     transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
 }
@@ -421,12 +406,7 @@ pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshldvd))]
-pub unsafe fn _mm512_mask_shldv_epi32(
-    a: __m512i,
-    k: __mmask16,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
+pub unsafe fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i {
     let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
     transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
 }
@@ -437,7 +417,12 @@ pub unsafe fn _mm512_mask_shldv_epi32(
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshldvd))]
-pub unsafe fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+pub unsafe fn _mm512_maskz_shldv_epi32(
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
     let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
     let zero = _mm512_setzero_si512().as_i32x16();
     transmute(simd_select_bitmask(k, shf, zero))
@@ -459,12 +444,7 @@ pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvd))]
-pub unsafe fn _mm256_mask_shldv_epi32(
-    a: __m256i,
-    k: __mmask8,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
+pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
     let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
     transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
 }
@@ -497,12 +477,7 @@ pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvd))]
-pub unsafe fn _mm_mask_shldv_epi32(
-    a: __m128i,
-    k: __mmask8,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
     let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
     transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
 }
@@ -535,12 +510,7 @@ pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshldvw))]
-pub unsafe fn _mm512_mask_shldv_epi16(
-    a: __m512i,
-    k: __mmask32,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
+pub unsafe fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i {
     let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
     transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
 }
@@ -551,7 +521,12 @@ pub unsafe fn _mm512_mask_shldv_epi16(
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshldvw))]
-pub unsafe fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+pub unsafe fn _mm512_maskz_shldv_epi16(
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
     let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
     let zero = _mm512_setzero_si512().as_i16x32();
     transmute(simd_select_bitmask(k, shf, zero))
@@ -573,12 +548,7 @@ pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvw))]
-pub unsafe fn _mm256_mask_shldv_epi16(
-    a: __m256i,
-    k: __mmask16,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
+pub unsafe fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i {
     let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
     transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
 }
@@ -589,7 +559,12 @@ pub unsafe fn _mm256_mask_shldv_epi16(
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvw))]
-pub unsafe fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+pub unsafe fn _mm256_maskz_shldv_epi16(
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
     let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
     let zero = _mm256_setzero_si256().as_i16x16();
     transmute(simd_select_bitmask(k, shf, zero))
@@ -611,12 +586,7 @@ pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshldvw))]
-pub unsafe fn _mm_mask_shldv_epi16(
-    a: __m128i,
-    k: __mmask8,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
     let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
     transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
 }
@@ -649,12 +619,7 @@ pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshrdvq))]
-pub unsafe fn _mm512_mask_shrdv_epi64(
-    a: __m512i,
-    k: __mmask8,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
+pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
     let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
     transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
 }
@@ -687,12 +652,7 @@ pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvq))]
-pub unsafe fn _mm256_mask_shrdv_epi64(
-    a: __m256i,
-    k: __mmask8,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
+pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
     let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
     transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
 }
@@ -725,12 +685,7 @@ pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvq))]
-pub unsafe fn _mm_mask_shrdv_epi64(
-    a: __m128i,
-    k: __mmask8,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
     let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
     transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
 }
@@ -763,12 +718,7 @@ pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshrdvd))]
-pub unsafe fn _mm512_mask_shrdv_epi32(
-    a: __m512i,
-    k: __mmask16,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
+pub unsafe fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i {
     let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
     transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
 }
@@ -779,7 +729,12 @@ pub unsafe fn _mm512_mask_shrdv_epi32(
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshrdvd))]
-pub unsafe fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+pub unsafe fn _mm512_maskz_shrdv_epi32(
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
     let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
     let zero = _mm512_setzero_si512().as_i32x16();
     transmute(simd_select_bitmask(k, shf, zero))
@@ -801,12 +756,7 @@ pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvd))]
-pub unsafe fn _mm256_mask_shrdv_epi32(
-    a: __m256i,
-    k: __mmask8,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
+pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
     let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
     transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
 }
@@ -839,12 +789,7 @@ pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvd))]
-pub unsafe fn _mm_mask_shrdv_epi32(
-    a: __m128i,
-    k: __mmask8,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
     let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
     transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
 }
@@ -877,12 +822,7 @@ pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshrdvw))]
-pub unsafe fn _mm512_mask_shrdv_epi16(
-    a: __m512i,
-    k: __mmask32,
-    b: __m512i,
-    c: __m512i,
-) -> __m512i {
+pub unsafe fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i {
     let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
     transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
 }
@@ -893,7 +833,12 @@ pub unsafe fn _mm512_mask_shrdv_epi16(
 #[inline]
 #[target_feature(enable = "avx512vbmi2")]
 #[cfg_attr(test, assert_instr(vpshrdvw))]
-pub unsafe fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
+pub unsafe fn _mm512_maskz_shrdv_epi16(
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+    c: __m512i,
+) -> __m512i {
     let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
     let zero = _mm512_setzero_si512().as_i16x32();
     transmute(simd_select_bitmask(k, shf, zero))
@@ -915,12 +860,7 @@ pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvw))]
-pub unsafe fn _mm256_mask_shrdv_epi16(
-    a: __m256i,
-    k: __mmask16,
-    b: __m256i,
-    c: __m256i,
-) -> __m256i {
+pub unsafe fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i {
     let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
     transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
 }
@@ -931,7 +871,12 @@ pub unsafe fn _mm256_mask_shrdv_epi16(
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvw))]
-pub unsafe fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+pub unsafe fn _mm256_maskz_shrdv_epi16(
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+    c: __m256i,
+) -> __m256i {
     let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
     let zero = _mm256_setzero_si256().as_i16x16();
     transmute(simd_select_bitmask(k, shf, zero))
@@ -953,12 +898,7 @@ pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
 #[inline]
 #[target_feature(enable = "avx512vbmi2,avx512vl")]
 #[cfg_attr(test, assert_instr(vpshrdvw))]
-pub unsafe fn _mm_mask_shrdv_epi16(
-    a: __m128i,
-    k: __mmask8,
-    b: __m128i,
-    c: __m128i,
-) -> __m128i {
+pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
     let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
     transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
 }
@@ -1366,7 +1306,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_shldv_epi64() {
         let a = _mm512_set1_epi64(1);
-        let b = _mm512_set1_epi64(1<<63);
+        let b = _mm512_set1_epi64(1 << 63);
         let c = _mm512_set1_epi64(2);
         let r = _mm512_shldv_epi64(a, b, c);
         let e = _mm512_set1_epi64(6);
@@ -1376,7 +1316,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_mask_shldv_epi64() {
         let a = _mm512_set1_epi64(1);
-        let b = _mm512_set1_epi64(1<<63);
+        let b = _mm512_set1_epi64(1 << 63);
         let c = _mm512_set1_epi64(2);
         let r = _mm512_mask_shldv_epi64(a, 0, b, c);
         assert_eq_m512i(r, a);
@@ -1388,7 +1328,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_maskz_shldv_epi64() {
         let a = _mm512_set1_epi64(1);
-        let b = _mm512_set1_epi64(1<<63);
+        let b = _mm512_set1_epi64(1 << 63);
         let c = _mm512_set1_epi64(2);
         let r = _mm512_maskz_shldv_epi64(0, a, b, c);
         assert_eq_m512i(r, _mm512_setzero_si512());
@@ -1400,7 +1340,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_shldv_epi64() {
         let a = _mm256_set1_epi64x(1);
-        let b = _mm256_set1_epi64x(1<<63);
+        let b = _mm256_set1_epi64x(1 << 63);
         let c = _mm256_set1_epi64x(2);
         let r = _mm256_shldv_epi64(a, b, c);
         let e = _mm256_set1_epi64x(6);
@@ -1410,7 +1350,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_mask_shldv_epi64() {
         let a = _mm256_set1_epi64x(1);
-        let b = _mm256_set1_epi64x(1<<63);
+        let b = _mm256_set1_epi64x(1 << 63);
         let c = _mm256_set1_epi64x(2);
         let r = _mm256_mask_shldv_epi64(a, 0, b, c);
         assert_eq_m256i(r, a);
@@ -1422,7 +1362,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_maskz_shldv_epi64() {
         let a = _mm256_set1_epi64x(1);
-        let b = _mm256_set1_epi64x(1<<63);
+        let b = _mm256_set1_epi64x(1 << 63);
         let c = _mm256_set1_epi64x(2);
         let r = _mm256_maskz_shldv_epi64(0, a, b, c);
         assert_eq_m256i(r, _mm256_setzero_si256());
@@ -1434,7 +1374,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_shldv_epi64() {
         let a = _mm_set1_epi64x(1);
-        let b = _mm_set1_epi64x(1<<63);
+        let b = _mm_set1_epi64x(1 << 63);
         let c = _mm_set1_epi64x(2);
         let r = _mm_shldv_epi64(a, b, c);
         let e = _mm_set1_epi64x(6);
@@ -1444,7 +1384,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_mask_shldv_epi64() {
         let a = _mm_set1_epi64x(1);
-        let b = _mm_set1_epi64x(1<<63);
+        let b = _mm_set1_epi64x(1 << 63);
         let c = _mm_set1_epi64x(2);
         let r = _mm_mask_shldv_epi64(a, 0, b, c);
         assert_eq_m128i(r, a);
@@ -1456,7 +1396,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_maskz_shldv_epi64() {
         let a = _mm_set1_epi64x(1);
-        let b = _mm_set1_epi64x(1<<63);
+        let b = _mm_set1_epi64x(1 << 63);
         let c = _mm_set1_epi64x(2);
         let r = _mm_maskz_shldv_epi64(0, a, b, c);
         assert_eq_m128i(r, _mm_setzero_si128());
@@ -1468,7 +1408,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_shldv_epi32() {
         let a = _mm512_set1_epi32(1);
-        let b = _mm512_set1_epi32(1<<31);
+        let b = _mm512_set1_epi32(1 << 31);
         let c = _mm512_set1_epi32(2);
         let r = _mm512_shldv_epi32(a, b, c);
         let e = _mm512_set1_epi32(6);
@@ -1478,7 +1418,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_mask_shldv_epi32() {
         let a = _mm512_set1_epi32(1);
-        let b = _mm512_set1_epi32(1<<31);
+        let b = _mm512_set1_epi32(1 << 31);
         let c = _mm512_set1_epi32(2);
         let r = _mm512_mask_shldv_epi32(a, 0, b, c);
         assert_eq_m512i(r, a);
@@ -1490,7 +1430,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_maskz_shldv_epi32() {
         let a = _mm512_set1_epi32(1);
-        let b = _mm512_set1_epi32(1<<31);
+        let b = _mm512_set1_epi32(1 << 31);
         let c = _mm512_set1_epi32(2);
         let r = _mm512_maskz_shldv_epi32(0, a, b, c);
         assert_eq_m512i(r, _mm512_setzero_si512());
@@ -1502,7 +1442,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_shldv_epi32() {
         let a = _mm256_set1_epi32(1);
-        let b = _mm256_set1_epi32(1<<31);
+        let b = _mm256_set1_epi32(1 << 31);
         let c = _mm256_set1_epi32(2);
         let r = _mm256_shldv_epi32(a, b, c);
         let e = _mm256_set1_epi32(6);
@@ -1512,7 +1452,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_mask_shldv_epi32() {
         let a = _mm256_set1_epi32(1);
-        let b = _mm256_set1_epi32(1<<31);
+        let b = _mm256_set1_epi32(1 << 31);
         let c = _mm256_set1_epi32(2);
         let r = _mm256_mask_shldv_epi32(a, 0, b, c);
         assert_eq_m256i(r, a);
@@ -1524,7 +1464,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_maskz_shldv_epi32() {
         let a = _mm256_set1_epi32(1);
-        let b = _mm256_set1_epi32(1<<31);
+        let b = _mm256_set1_epi32(1 << 31);
         let c = _mm256_set1_epi32(2);
         let r = _mm256_maskz_shldv_epi32(0, a, b, c);
         assert_eq_m256i(r, _mm256_setzero_si256());
@@ -1536,7 +1476,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_shldv_epi32() {
         let a = _mm_set1_epi32(1);
-        let b = _mm_set1_epi32(1<<31);
+        let b = _mm_set1_epi32(1 << 31);
         let c = _mm_set1_epi32(2);
         let r = _mm_shldv_epi32(a, b, c);
         let e = _mm_set1_epi32(6);
@@ -1546,7 +1486,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_mask_shldv_epi32() {
         let a = _mm_set1_epi32(1);
-        let b = _mm_set1_epi32(1<<31);
+        let b = _mm_set1_epi32(1 << 31);
         let c = _mm_set1_epi32(2);
         let r = _mm_mask_shldv_epi32(a, 0, b, c);
         assert_eq_m128i(r, a);
@@ -1558,7 +1498,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_maskz_shldv_epi32() {
         let a = _mm_set1_epi32(1);
-        let b = _mm_set1_epi32(1<<31);
+        let b = _mm_set1_epi32(1 << 31);
         let c = _mm_set1_epi32(2);
         let r = _mm_maskz_shldv_epi32(0, a, b, c);
         assert_eq_m128i(r, _mm_setzero_si128());
@@ -1570,7 +1510,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_shldv_epi16() {
         let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(1<<15);
+        let b = _mm512_set1_epi16(1 << 15);
         let c = _mm512_set1_epi16(2);
         let r = _mm512_shldv_epi16(a, b, c);
         let e = _mm512_set1_epi16(6);
@@ -1580,7 +1520,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_mask_shldv_epi16() {
         let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(1<<15);
+        let b = _mm512_set1_epi16(1 << 15);
         let c = _mm512_set1_epi16(2);
         let r = _mm512_mask_shldv_epi16(a, 0, b, c);
         assert_eq_m512i(r, a);
@@ -1592,7 +1532,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2")]
     unsafe fn test_mm512_maskz_shldv_epi16() {
         let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(1<<15);
+        let b = _mm512_set1_epi16(1 << 15);
         let c = _mm512_set1_epi16(2);
         let r = _mm512_maskz_shldv_epi16(0, a, b, c);
         assert_eq_m512i(r, _mm512_setzero_si512());
@@ -1604,7 +1544,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_shldv_epi16() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(1<<15);
+        let b = _mm256_set1_epi16(1 << 15);
         let c = _mm256_set1_epi16(2);
         let r = _mm256_shldv_epi16(a, b, c);
         let e = _mm256_set1_epi16(6);
@@ -1614,7 +1554,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_mask_shldv_epi16() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(1<<15);
+        let b = _mm256_set1_epi16(1 << 15);
         let c = _mm256_set1_epi16(2);
         let r = _mm256_mask_shldv_epi16(a, 0, b, c);
         assert_eq_m256i(r, a);
@@ -1626,7 +1566,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm256_maskz_shldv_epi16() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(1<<15);
+        let b = _mm256_set1_epi16(1 << 15);
         let c = _mm256_set1_epi16(2);
         let r = _mm256_maskz_shldv_epi16(0, a, b, c);
         assert_eq_m256i(r, _mm256_setzero_si256());
@@ -1638,7 +1578,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_shldv_epi16() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(1<<15);
+        let b = _mm_set1_epi16(1 << 15);
         let c = _mm_set1_epi16(2);
         let r = _mm_shldv_epi16(a, b, c);
         let e = _mm_set1_epi16(6);
@@ -1648,7 +1588,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_mask_shldv_epi16() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(1<<15);
+        let b = _mm_set1_epi16(1 << 15);
         let c = _mm_set1_epi16(2);
         let r = _mm_mask_shldv_epi16(a, 0, b, c);
         assert_eq_m128i(r, a);
@@ -1660,7 +1600,7 @@ mod tests {
     #[simd_test(enable = "avx512vbmi2,avx512vl")]
     unsafe fn test_mm_maskz_shldv_epi16() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(1<<15);
+        let b = _mm_set1_epi16(1 << 15);
         let c = _mm_set1_epi16(2);
         let r = _mm_maskz_shldv_epi16(0, a, b, c);
         assert_eq_m128i(r, _mm_setzero_si128());
@@ -1668,4 +1608,310 @@ mod tests {
         let e = _mm_set1_epi16(6);
         assert_eq_m128i(r, e);
     }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shrdv_epi64() {
+        let a = _mm512_set1_epi64(8);
+        let b = _mm512_set1_epi64(2);
+        let c = _mm512_set1_epi64(1);
+        let r = _mm512_shrdv_epi64(a, b, c);
+        let e = _mm512_set1_epi64(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shrdv_epi64() {
+        let a = _mm512_set1_epi64(8);
+        let b = _mm512_set1_epi64(2);
+        let c = _mm512_set1_epi64(1);
+        let r = _mm512_mask_shrdv_epi64(a, 0, b, c);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c);
+        let e = _mm512_set1_epi64(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shrdv_epi64() {
+        let a = _mm512_set1_epi64(8);
+        let b = _mm512_set1_epi64(2);
+        let c = _mm512_set1_epi64(1);
+        let r = _mm512_maskz_shrdv_epi64(0, a, b, c);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c);
+        let e = _mm512_set1_epi64(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shrdv_epi64() {
+        let a = _mm256_set1_epi64x(8);
+        let b = _mm256_set1_epi64x(2);
+        let c = _mm256_set1_epi64x(1);
+        let r = _mm256_shrdv_epi64(a, b, c);
+        let e = _mm256_set1_epi64x(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shrdv_epi64() {
+        let a = _mm256_set1_epi64x(8);
+        let b = _mm256_set1_epi64x(2);
+        let c = _mm256_set1_epi64x(1);
+        let r = _mm256_mask_shrdv_epi64(a, 0, b, c);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c);
+        let e = _mm256_set1_epi64x(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shrdv_epi64() {
+        let a = _mm256_set1_epi64x(8);
+        let b = _mm256_set1_epi64x(2);
+        let c = _mm256_set1_epi64x(1);
+        let r = _mm256_maskz_shrdv_epi64(0, a, b, c);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c);
+        let e = _mm256_set1_epi64x(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shrdv_epi64() {
+        let a = _mm_set1_epi64x(8);
+        let b = _mm_set1_epi64x(2);
+        let c = _mm_set1_epi64x(1);
+        let r = _mm_shrdv_epi64(a, b, c);
+        let e = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shrdv_epi64() {
+        let a = _mm_set1_epi64x(8);
+        let b = _mm_set1_epi64x(2);
+        let c = _mm_set1_epi64x(1);
+        let r = _mm_mask_shrdv_epi64(a, 0, b, c);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c);
+        let e = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shrdv_epi64() {
+        let a = _mm_set1_epi64x(8);
+        let b = _mm_set1_epi64x(2);
+        let c = _mm_set1_epi64x(1);
+        let r = _mm_maskz_shrdv_epi64(0, a, b, c);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c);
+        let e = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shrdv_epi32() {
+        let a = _mm512_set1_epi32(8);
+        let b = _mm512_set1_epi32(2);
+        let c = _mm512_set1_epi32(1);
+        let r = _mm512_shrdv_epi32(a, b, c);
+        let e = _mm512_set1_epi32(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shrdv_epi32() {
+        let a = _mm512_set1_epi32(8);
+        let b = _mm512_set1_epi32(2);
+        let c = _mm512_set1_epi32(1);
+        let r = _mm512_mask_shrdv_epi32(a, 0, b, c);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c);
+        let e = _mm512_set1_epi32(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shrdv_epi32() {
+        let a = _mm512_set1_epi32(8);
+        let b = _mm512_set1_epi32(2);
+        let c = _mm512_set1_epi32(1);
+        let r = _mm512_maskz_shrdv_epi32(0, a, b, c);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c);
+        let e = _mm512_set1_epi32(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shrdv_epi32() {
+        let a = _mm256_set1_epi32(8);
+        let b = _mm256_set1_epi32(2);
+        let c = _mm256_set1_epi32(1);
+        let r = _mm256_shrdv_epi32(a, b, c);
+        let e = _mm256_set1_epi32(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shrdv_epi32() {
+        let a = _mm256_set1_epi32(8);
+        let b = _mm256_set1_epi32(2);
+        let c = _mm256_set1_epi32(1);
+        let r = _mm256_mask_shrdv_epi32(a, 0, b, c);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c);
+        let e = _mm256_set1_epi32(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shrdv_epi32() {
+        let a = _mm256_set1_epi32(8);
+        let b = _mm256_set1_epi32(2);
+        let c = _mm256_set1_epi32(1);
+        let r = _mm256_maskz_shrdv_epi32(0, a, b, c);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c);
+        let e = _mm256_set1_epi32(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shrdv_epi32() {
+        let a = _mm_set1_epi32(8);
+        let b = _mm_set1_epi32(2);
+        let c = _mm_set1_epi32(1);
+        let r = _mm_shrdv_epi32(a, b, c);
+        let e = _mm_set1_epi32(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shrdv_epi32() {
+        let a = _mm_set1_epi32(8);
+        let b = _mm_set1_epi32(2);
+        let c = _mm_set1_epi32(1);
+        let r = _mm_mask_shrdv_epi32(a, 0, b, c);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c);
+        let e = _mm_set1_epi32(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shrdv_epi32() {
+        let a = _mm_set1_epi32(8);
+        let b = _mm_set1_epi32(2);
+        let c = _mm_set1_epi32(1);
+        let r = _mm_maskz_shrdv_epi32(0, a, b, c);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c);
+        let e = _mm_set1_epi32(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shrdv_epi16() {
+        let a = _mm512_set1_epi16(8);
+        let b = _mm512_set1_epi16(2);
+        let c = _mm512_set1_epi16(1);
+        let r = _mm512_shrdv_epi16(a, b, c);
+        let e = _mm512_set1_epi16(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shrdv_epi16() {
+        let a = _mm512_set1_epi16(8);
+        let b = _mm512_set1_epi16(2);
+        let c = _mm512_set1_epi16(1);
+        let r = _mm512_mask_shrdv_epi16(a, 0, b, c);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
+        let e = _mm512_set1_epi16(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shrdv_epi16() {
+        let a = _mm512_set1_epi16(8);
+        let b = _mm512_set1_epi16(2);
+        let c = _mm512_set1_epi16(1);
+        let r = _mm512_maskz_shrdv_epi16(0, a, b, c);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
+        let e = _mm512_set1_epi16(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shrdv_epi16() {
+        let a = _mm256_set1_epi16(8);
+        let b = _mm256_set1_epi16(2);
+        let c = _mm256_set1_epi16(1);
+        let r = _mm256_shrdv_epi16(a, b, c);
+        let e = _mm256_set1_epi16(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shrdv_epi16() {
+        let a = _mm256_set1_epi16(8);
+        let b = _mm256_set1_epi16(2);
+        let c = _mm256_set1_epi16(1);
+        let r = _mm256_mask_shrdv_epi16(a, 0, b, c);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c);
+        let e = _mm256_set1_epi16(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shrdv_epi16() {
+        let a = _mm256_set1_epi16(8);
+        let b = _mm256_set1_epi16(2);
+        let c = _mm256_set1_epi16(1);
+        let r = _mm256_maskz_shrdv_epi16(0, a, b, c);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c);
+        let e = _mm256_set1_epi16(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shrdv_epi16() {
+        let a = _mm_set1_epi16(8);
+        let b = _mm_set1_epi16(2);
+        let c = _mm_set1_epi16(1);
+        let r = _mm_shrdv_epi16(a, b, c);
+        let e = _mm_set1_epi16(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shrdv_epi16() {
+        let a = _mm_set1_epi16(8);
+        let b = _mm_set1_epi16(2);
+        let c = _mm_set1_epi16(1);
+        let r = _mm_mask_shrdv_epi16(a, 0, b, c);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c);
+        let e = _mm_set1_epi16(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shrdv_epi16() {
+        let a = _mm_set1_epi16(8);
+        let b = _mm_set1_epi16(2);
+        let c = _mm_set1_epi16(1);
+        let r = _mm_maskz_shrdv_epi16(0, a, b, c);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c);
+        let e = _mm_set1_epi16(1);
+        assert_eq_m128i(r, e);
+    }
 }

From cf7064d3e73dc618d2eb8cdeaaa3290ad2245474 Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Sat, 9 Jan 2021 20:51:16 +0000
Subject: [PATCH 06/10] shldi_epi64,epi32,epi16: mm512,mm256,mm

---
 crates/core_arch/src/x86/avx512vbmi2.rs | 1087 +++++++++++++++++++++++
 1 file changed, 1087 insertions(+)

diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index 94ce39496b..78091daa91 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -915,6 +915,507 @@ pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m1
     transmute(simd_select_bitmask(k, shf, zero))
 }
 
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi64&expand=5060)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvq(
+        a.as_i64x8(),
+        b.as_i64x8(),
+        _mm512_set1_epi64(imm8 as i64).as_i64x8(),
+    ))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi64&expand=5058)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shldi_epi64(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+    imm8: i32,
+) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x8 = vpshldvq(
+        a.as_i64x8(),
+        b.as_i64x8(),
+        _mm512_set1_epi64(imm8 as i64).as_i64x8(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi64&expand=5059)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x8 = vpshldvq(
+        a.as_i64x8(),
+        b.as_i64x8(),
+        _mm512_set1_epi64(imm8 as i64).as_i64x8(),
+    );
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi64&expand=5057)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvq256(
+        a.as_i64x4(),
+        b.as_i64x4(),
+        _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
+    ))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi64&expand=5055)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shldi_epi64(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x4 = vpshldvq256(
+        a.as_i64x4(),
+        b.as_i64x4(),
+        _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi64&expand=5056)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x4 = vpshldvq256(
+        a.as_i64x4(),
+        b.as_i64x4(),
+        _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
+    );
+    let zero = _mm256_setzero_si256().as_i64x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi64&expand=5054)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvq128(
+        a.as_i64x2(),
+        b.as_i64x2(),
+        _mm_set1_epi64x(imm8 as i64).as_i64x2(),
+    ))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi64&expand=5052)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shldi_epi64(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x2 = vpshldvq128(
+        a.as_i64x2(),
+        b.as_i64x2(),
+        _mm_set1_epi64x(imm8 as i64).as_i64x2(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi64&expand=5053)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x2 = vpshldvq128(
+        a.as_i64x2(),
+        b.as_i64x2(),
+        _mm_set1_epi64x(imm8 as i64).as_i64x2(),
+    );
+    let zero = _mm_setzero_si128().as_i64x2();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi32&expand=5051)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvd(
+        a.as_i32x16(),
+        b.as_i32x16(),
+        _mm512_set1_epi32(imm8).as_i32x16(),
+    ))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi32&expand=5049)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shldi_epi32(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    imm8: i32,
+) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x16 = vpshldvd(
+        a.as_i32x16(),
+        b.as_i32x16(),
+        _mm512_set1_epi32(imm8).as_i32x16(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi32&expand=5050)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x16 = vpshldvd(
+        a.as_i32x16(),
+        b.as_i32x16(),
+        _mm512_set1_epi32(imm8).as_i32x16(),
+    );
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi32&expand=5048)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvd256(
+        a.as_i32x8(),
+        b.as_i32x8(),
+        _mm256_set1_epi32(imm8).as_i32x8(),
+    ))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi32&expand=5046)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shldi_epi32(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x8 = vpshldvd256(
+        a.as_i32x8(),
+        b.as_i32x8(),
+        _mm256_set1_epi32(imm8).as_i32x8(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi32&expand=5047)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x8 = vpshldvd256(
+        a.as_i32x8(),
+        b.as_i32x8(),
+        _mm256_set1_epi32(imm8).as_i32x8(),
+    );
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi32&expand=5045)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvd128(
+        a.as_i32x4(),
+        b.as_i32x4(),
+        _mm_set1_epi32(imm8).as_i32x4(),
+    ))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi32&expand=5043)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shldi_epi32(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
+    transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi32&expand=5044)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi16&expand=5042)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvw(
+        a.as_i16x32(),
+        b.as_i16x32(),
+        _mm512_set1_epi16(imm8 as i16).as_i16x32(),
+    ))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi16&expand=5040)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shldi_epi16(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+    imm8: i32,
+) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i16x32 = vpshldvw(
+        a.as_i16x32(),
+        b.as_i16x32(),
+        _mm512_set1_epi16(imm8 as i16).as_i16x32(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi16&expand=5041)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i16x32 = vpshldvw(
+        a.as_i16x32(),
+        b.as_i16x32(),
+        _mm512_set1_epi16(imm8 as i16).as_i16x32(),
+    );
+    let zero = _mm512_setzero_si512().as_i16x32();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi16&expand=5039)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshldvw256(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        _mm256_set1_epi16(imm8 as i16).as_i16x16(),
+    ))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi16&expand=5037)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shldi_epi16(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i16x16 = vpshldvw256(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        _mm256_set1_epi16(imm8 as i16).as_i16x16(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi16&expand=5038)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let shf: i16x16 = vpshldvw256(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        _mm256_set1_epi16(imm8 as i16).as_i16x16(),
+    );
+    let zero = _mm256_setzero_si256().as_i16x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi16&expand=5036)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    transmute(vpshldvw128(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        _mm_set1_epi16(imm8 as i16).as_i16x8(),
+    ))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi16&expand=5034)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shldi_epi16(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    let shf: i16x8 = vpshldvw128(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        _mm_set1_epi16(imm8 as i16).as_i16x8(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi16&expand=5035)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let shf: i16x8 = vpshldvw128(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        _mm_set1_epi16(imm8 as i16).as_i16x8(),
+    );
+    let zero = _mm_setzero_si128().as_i16x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
@@ -1914,4 +2415,590 @@ mod tests {
         let e = _mm_set1_epi16(1);
         assert_eq_m128i(r, e);
     }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shldi_epi64() {
+        let a = _mm512_set1_epi64(1);
+        let b = _mm512_set1_epi64(1 << 63);
+        let r = _mm512_shldi_epi64(a, b, 2);
+        let e = _mm512_set1_epi64(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shldi_epi64() {
+        let a = _mm512_set1_epi64(1);
+        let b = _mm512_set1_epi64(1 << 63);
+        let r = _mm512_mask_shldi_epi64(a, 0, a, b, 2);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shldi_epi64(a, 0b11111111, a, b, 2);
+        let e = _mm512_set1_epi64(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shldi_epi64() {
+        let a = _mm512_set1_epi64(1);
+        let b = _mm512_set1_epi64(1 << 63);
+        let r = _mm512_maskz_shldi_epi64(0, a, b, 2);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shldi_epi64(0b11111111, a, b, 2);
+        let e = _mm512_set1_epi64(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shldi_epi64() {
+        let a = _mm256_set1_epi64x(1);
+        let b = _mm256_set1_epi64x(1 << 63);
+        let r = _mm256_shldi_epi64(a, b, 2);
+        let e = _mm256_set1_epi64x(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shldi_epi64() {
+        let a = _mm256_set1_epi64x(1);
+        let b = _mm256_set1_epi64x(1 << 63);
+        let r = _mm256_mask_shldi_epi64(a, 0, a, b, 2);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shldi_epi64(a, 0b00001111, a, b, 2);
+        let e = _mm256_set1_epi64x(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shldi_epi64() {
+        let a = _mm256_set1_epi64x(1);
+        let b = _mm256_set1_epi64x(1 << 63);
+        let r = _mm256_maskz_shldi_epi64(0, a, b, 2);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shldi_epi64(0b00001111, a, b, 2);
+        let e = _mm256_set1_epi64x(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shldi_epi64() {
+        let a = _mm_set1_epi64x(1);
+        let b = _mm_set1_epi64x(1 << 63);
+        let r = _mm_shldi_epi64(a, b, 2);
+        let e = _mm_set1_epi64x(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shldi_epi64() {
+        let a = _mm_set1_epi64x(1);
+        let b = _mm_set1_epi64x(1 << 63);
+        let r = _mm_mask_shldi_epi64(a, 0, a, b, 2);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shldi_epi64(a, 0b00000011, a, b, 2);
+        let e = _mm_set1_epi64x(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shldi_epi64() {
+        let a = _mm_set1_epi64x(1);
+        let b = _mm_set1_epi64x(1 << 63);
+        let r = _mm_maskz_shldi_epi64(0, a, b, 2);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shldi_epi64(0b00000011, a, b, 2);
+        let e = _mm_set1_epi64x(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shldi_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1 << 31);
+        let r = _mm512_shldi_epi32(a, b, 2);
+        let e = _mm512_set1_epi32(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shldi_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1 << 31);
+        let r = _mm512_mask_shldi_epi32(a, 0, a, b, 2);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shldi_epi32(a, 0b11111111_11111111, a, b, 2);
+        let e = _mm512_set1_epi32(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shldi_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1 << 31);
+        let r = _mm512_maskz_shldi_epi32(0, a, b, 2);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shldi_epi32(0b11111111_11111111, a, b, 2);
+        let e = _mm512_set1_epi32(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shldi_epi32() {
+        let a = _mm256_set1_epi32(1);
+        let b = _mm256_set1_epi32(1 << 31);
+        let r = _mm256_shldi_epi32(a, b, 2);
+        let e = _mm256_set1_epi32(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shldi_epi32() {
+        let a = _mm256_set1_epi32(1);
+        let b = _mm256_set1_epi32(1 << 31);
+        let r = _mm256_mask_shldi_epi32(a, 0, a, b, 2);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shldi_epi32(a, 0b11111111, a, b, 2);
+        let e = _mm256_set1_epi32(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shldi_epi32() {
+        let a = _mm256_set1_epi32(1);
+        let b = _mm256_set1_epi32(1 << 31);
+        let r = _mm256_maskz_shldi_epi32(0, a, b, 2);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shldi_epi32(0b11111111, a, b, 2);
+        let e = _mm256_set1_epi32(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shldi_epi32() {
+        let a = _mm_set1_epi32(1);
+        let b = _mm_set1_epi32(1 << 31);
+        let r = _mm_shldi_epi32(a, b, 2);
+        let e = _mm_set1_epi32(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shldi_epi32() {
+        let a = _mm_set1_epi32(1);
+        let b = _mm_set1_epi32(1 << 31);
+        let r = _mm_mask_shldi_epi32(a, 0, a, b, 2);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shldi_epi32(a, 0b00001111, a, b, 2);
+        let e = _mm_set1_epi32(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shldi_epi32() {
+        let a = _mm_set1_epi32(1);
+        let b = _mm_set1_epi32(1 << 31);
+        let r = _mm_maskz_shldi_epi32(0, a, b, 2);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shldi_epi32(0b00001111, a, b, 2);
+        let e = _mm_set1_epi32(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shldi_epi16() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1 << 15);
+        let r = _mm512_shldi_epi16(a, b, 2);
+        let e = _mm512_set1_epi16(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shldi_epi16() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1 << 15);
+        let r = _mm512_mask_shldi_epi16(a, 0, a, b, 2);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shldi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 2);
+        let e = _mm512_set1_epi16(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shldi_epi16() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1 << 15);
+        let r = _mm512_maskz_shldi_epi16(0, a, b, 2);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shldi_epi16(0b11111111_11111111_11111111_11111111, a, b, 2);
+        let e = _mm512_set1_epi16(6);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shldi_epi16() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1 << 15);
+        let r = _mm256_shldi_epi16(a, b, 2);
+        let e = _mm256_set1_epi16(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shldi_epi16() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1 << 15);
+        let r = _mm256_mask_shldi_epi16(a, 0, a, b, 2);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shldi_epi16(a, 0b11111111_11111111, a, b, 2);
+        let e = _mm256_set1_epi16(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shldi_epi16() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1 << 15);
+        let r = _mm256_maskz_shldi_epi16(0, a, b, 2);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shldi_epi16(0b11111111_11111111, a, b, 2);
+        let e = _mm256_set1_epi16(6);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shldi_epi16() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1 << 15);
+        let r = _mm_shldi_epi16(a, b, 2);
+        let e = _mm_set1_epi16(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shldi_epi16() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1 << 15);
+        let r = _mm_mask_shldi_epi16(a, 0, a, b, 2);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shldi_epi16(a, 0b11111111, a, b, 2);
+        let e = _mm_set1_epi16(6);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shldi_epi16() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1 << 15);
+        let r = _mm_maskz_shldi_epi16(0, a, b, 2);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shldi_epi16(0b11111111, a, b, 2);
+        let e = _mm_set1_epi16(6);
+        assert_eq_m128i(r, e);
+    }
+    /*
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_shrdv_epi64() {
+            let a = _mm512_set1_epi64(8);
+            let b = _mm512_set1_epi64(2);
+            let c = _mm512_set1_epi64(1);
+            let r = _mm512_shrdv_epi64(a, b, c);
+            let e = _mm512_set1_epi64(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_mask_shrdv_epi64() {
+            let a = _mm512_set1_epi64(8);
+            let b = _mm512_set1_epi64(2);
+            let c = _mm512_set1_epi64(1);
+            let r = _mm512_mask_shrdv_epi64(a, 0, b, c);
+            assert_eq_m512i(r, a);
+            let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c);
+            let e = _mm512_set1_epi64(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_maskz_shrdv_epi64() {
+            let a = _mm512_set1_epi64(8);
+            let b = _mm512_set1_epi64(2);
+            let c = _mm512_set1_epi64(1);
+            let r = _mm512_maskz_shrdv_epi64(0, a, b, c);
+            assert_eq_m512i(r, _mm512_setzero_si512());
+            let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c);
+            let e = _mm512_set1_epi64(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_shrdv_epi64() {
+            let a = _mm256_set1_epi64x(8);
+            let b = _mm256_set1_epi64x(2);
+            let c = _mm256_set1_epi64x(1);
+            let r = _mm256_shrdv_epi64(a, b, c);
+            let e = _mm256_set1_epi64x(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_mask_shrdv_epi64() {
+            let a = _mm256_set1_epi64x(8);
+            let b = _mm256_set1_epi64x(2);
+            let c = _mm256_set1_epi64x(1);
+            let r = _mm256_mask_shrdv_epi64(a, 0, b, c);
+            assert_eq_m256i(r, a);
+            let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c);
+            let e = _mm256_set1_epi64x(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_maskz_shrdv_epi64() {
+            let a = _mm256_set1_epi64x(8);
+            let b = _mm256_set1_epi64x(2);
+            let c = _mm256_set1_epi64x(1);
+            let r = _mm256_maskz_shrdv_epi64(0, a, b, c);
+            assert_eq_m256i(r, _mm256_setzero_si256());
+            let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c);
+            let e = _mm256_set1_epi64x(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_shrdv_epi64() {
+            let a = _mm_set1_epi64x(8);
+            let b = _mm_set1_epi64x(2);
+            let c = _mm_set1_epi64x(1);
+            let r = _mm_shrdv_epi64(a, b, c);
+            let e = _mm_set1_epi64x(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_mask_shrdv_epi64() {
+            let a = _mm_set1_epi64x(8);
+            let b = _mm_set1_epi64x(2);
+            let c = _mm_set1_epi64x(1);
+            let r = _mm_mask_shrdv_epi64(a, 0, b, c);
+            assert_eq_m128i(r, a);
+            let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c);
+            let e = _mm_set1_epi64x(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_maskz_shrdv_epi64() {
+            let a = _mm_set1_epi64x(8);
+            let b = _mm_set1_epi64x(2);
+            let c = _mm_set1_epi64x(1);
+            let r = _mm_maskz_shrdv_epi64(0, a, b, c);
+            assert_eq_m128i(r, _mm_setzero_si128());
+            let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c);
+            let e = _mm_set1_epi64x(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_shrdv_epi32() {
+            let a = _mm512_set1_epi32(8);
+            let b = _mm512_set1_epi32(2);
+            let c = _mm512_set1_epi32(1);
+            let r = _mm512_shrdv_epi32(a, b, c);
+            let e = _mm512_set1_epi32(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_mask_shrdv_epi32() {
+            let a = _mm512_set1_epi32(8);
+            let b = _mm512_set1_epi32(2);
+            let c = _mm512_set1_epi32(1);
+            let r = _mm512_mask_shrdv_epi32(a, 0, b, c);
+            assert_eq_m512i(r, a);
+            let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c);
+            let e = _mm512_set1_epi32(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_maskz_shrdv_epi32() {
+            let a = _mm512_set1_epi32(8);
+            let b = _mm512_set1_epi32(2);
+            let c = _mm512_set1_epi32(1);
+            let r = _mm512_maskz_shrdv_epi32(0, a, b, c);
+            assert_eq_m512i(r, _mm512_setzero_si512());
+            let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c);
+            let e = _mm512_set1_epi32(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_shrdv_epi32() {
+            let a = _mm256_set1_epi32(8);
+            let b = _mm256_set1_epi32(2);
+            let c = _mm256_set1_epi32(1);
+            let r = _mm256_shrdv_epi32(a, b, c);
+            let e = _mm256_set1_epi32(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_mask_shrdv_epi32() {
+            let a = _mm256_set1_epi32(8);
+            let b = _mm256_set1_epi32(2);
+            let c = _mm256_set1_epi32(1);
+            let r = _mm256_mask_shrdv_epi32(a, 0, b, c);
+            assert_eq_m256i(r, a);
+            let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c);
+            let e = _mm256_set1_epi32(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_maskz_shrdv_epi32() {
+            let a = _mm256_set1_epi32(8);
+            let b = _mm256_set1_epi32(2);
+            let c = _mm256_set1_epi32(1);
+            let r = _mm256_maskz_shrdv_epi32(0, a, b, c);
+            assert_eq_m256i(r, _mm256_setzero_si256());
+            let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c);
+            let e = _mm256_set1_epi32(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_shrdv_epi32() {
+            let a = _mm_set1_epi32(8);
+            let b = _mm_set1_epi32(2);
+            let c = _mm_set1_epi32(1);
+            let r = _mm_shrdv_epi32(a, b, c);
+            let e = _mm_set1_epi32(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_mask_shrdv_epi32() {
+            let a = _mm_set1_epi32(8);
+            let b = _mm_set1_epi32(2);
+            let c = _mm_set1_epi32(1);
+            let r = _mm_mask_shrdv_epi32(a, 0, b, c);
+            assert_eq_m128i(r, a);
+            let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c);
+            let e = _mm_set1_epi32(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_maskz_shrdv_epi32() {
+            let a = _mm_set1_epi32(8);
+            let b = _mm_set1_epi32(2);
+            let c = _mm_set1_epi32(1);
+            let r = _mm_maskz_shrdv_epi32(0, a, b, c);
+            assert_eq_m128i(r, _mm_setzero_si128());
+            let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c);
+            let e = _mm_set1_epi32(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_shrdv_epi16() {
+            let a = _mm512_set1_epi16(8);
+            let b = _mm512_set1_epi16(2);
+            let c = _mm512_set1_epi16(1);
+            let r = _mm512_shrdv_epi16(a, b, c);
+            let e = _mm512_set1_epi16(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_mask_shrdv_epi16() {
+            let a = _mm512_set1_epi16(8);
+            let b = _mm512_set1_epi16(2);
+            let c = _mm512_set1_epi16(1);
+            let r = _mm512_mask_shrdv_epi16(a, 0, b, c);
+            assert_eq_m512i(r, a);
+            let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
+            let e = _mm512_set1_epi16(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2")]
+        unsafe fn test_mm512_maskz_shrdv_epi16() {
+            let a = _mm512_set1_epi16(8);
+            let b = _mm512_set1_epi16(2);
+            let c = _mm512_set1_epi16(1);
+            let r = _mm512_maskz_shrdv_epi16(0, a, b, c);
+            assert_eq_m512i(r, _mm512_setzero_si512());
+            let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
+            let e = _mm512_set1_epi16(1);
+            assert_eq_m512i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_shrdv_epi16() {
+            let a = _mm256_set1_epi16(8);
+            let b = _mm256_set1_epi16(2);
+            let c = _mm256_set1_epi16(1);
+            let r = _mm256_shrdv_epi16(a, b, c);
+            let e = _mm256_set1_epi16(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_mask_shrdv_epi16() {
+            let a = _mm256_set1_epi16(8);
+            let b = _mm256_set1_epi16(2);
+            let c = _mm256_set1_epi16(1);
+            let r = _mm256_mask_shrdv_epi16(a, 0, b, c);
+            assert_eq_m256i(r, a);
+            let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c);
+            let e = _mm256_set1_epi16(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm256_maskz_shrdv_epi16() {
+            let a = _mm256_set1_epi16(8);
+            let b = _mm256_set1_epi16(2);
+            let c = _mm256_set1_epi16(1);
+            let r = _mm256_maskz_shrdv_epi16(0, a, b, c);
+            assert_eq_m256i(r, _mm256_setzero_si256());
+            let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c);
+            let e = _mm256_set1_epi16(1);
+            assert_eq_m256i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_shrdv_epi16() {
+            let a = _mm_set1_epi16(8);
+            let b = _mm_set1_epi16(2);
+            let c = _mm_set1_epi16(1);
+            let r = _mm_shrdv_epi16(a, b, c);
+            let e = _mm_set1_epi16(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_mask_shrdv_epi16() {
+            let a = _mm_set1_epi16(8);
+            let b = _mm_set1_epi16(2);
+            let c = _mm_set1_epi16(1);
+            let r = _mm_mask_shrdv_epi16(a, 0, b, c);
+            assert_eq_m128i(r, a);
+            let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c);
+            let e = _mm_set1_epi16(1);
+            assert_eq_m128i(r, e);
+        }
+
+        #[simd_test(enable = "avx512vbmi2,avx512vl")]
+        unsafe fn test_mm_maskz_shrdv_epi16() {
+            let a = _mm_set1_epi16(8);
+            let b = _mm_set1_epi16(2);
+            let c = _mm_set1_epi16(1);
+            let r = _mm_maskz_shrdv_epi16(0, a, b, c);
+            assert_eq_m128i(r, _mm_setzero_si128());
+            let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c);
+            let e = _mm_set1_epi16(1);
+            assert_eq_m128i(r, e);
+        }
+    */
 }

From 2ee39f3368e1c3c937289c6ec1c99a210410af04 Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Sun, 10 Jan 2021 00:15:50 +0000
Subject: [PATCH 07/10] shrdi_epi64,epi32,epi16: mm512,mm256,mm

---
 crates/core_arch/avx512vbmi2.md         |  154 ++++
 crates/core_arch/src/x86/avx512vbmi2.rs | 1087 ++++++++++++++++-------
 2 files changed, 934 insertions(+), 307 deletions(-)
 create mode 100644 crates/core_arch/avx512vbmi2.md

diff --git a/crates/core_arch/avx512vbmi2.md b/crates/core_arch/avx512vbmi2.md
new file mode 100644
index 0000000000..43d497108e
--- /dev/null
+++ b/crates/core_arch/avx512vbmi2.md
@@ -0,0 +1,154 @@
+<summary>["AVX512_VBMI2"]</summary><p>
+
+  * [x] [`_mm_mask_compress_epi16`]
+  * [x] [`_mm_maskz_compress_epi16`]
+  * [x] [`_mm256_mask_compress_epi16`]
+  * [x] [`_mm256_maskz_compress_epi16`]
+  * [x] [`_mm512_mask_compress_epi16`]
+  * [x] [`_mm512_maskz_compress_epi16`]
+  * [x] [`_mm_mask_compress_epi8`]
+  * [x] [`_mm_maskz_compress_epi8`]
+  * [x] [`_mm256_mask_compress_epi8`]
+  * [x] [`_mm256_maskz_compress_epi8`]
+  * [x] [`_mm512_mask_compress_epi8`]
+  * [x] [`_mm512_maskz_compress_epi8`]
+  * [_] [`_mm_mask_compressstoreu_epi16`]
+  * [_] [`_mm256_mask_compressstoreu_epi16`]
+  * [_] [`_mm512_mask_compressstoreu_epi16`]
+  * [_] [`_mm_mask_compressstoreu_epi8`]
+  * [_] [`_mm256_mask_compressstoreu_epi8`]
+  * [_] [`_mm512_mask_compressstoreu_epi8`]
+  * [x] [`_mm_mask_expand_epi16`]
+  * [x] [`_mm_maskz_expand_epi16`]
+  * [x] [`_mm256_mask_expand_epi16`]
+  * [x] [`_mm256_maskz_expand_epi16`]
+  * [x] [`_mm512_mask_expand_epi16`]
+  * [x] [`_mm512_maskz_expand_epi16`]
+  * [x] [`_mm_mask_expand_epi8`]
+  * [x] [`_mm_maskz_expand_epi8`]
+  * [x] [`_mm256_mask_expand_epi8`]
+  * [x] [`_mm256_maskz_expand_epi8`]
+  * [x] [`_mm512_mask_expand_epi8`]
+  * [x] [`_mm512_maskz_expand_epi8`]
+  * [_] [`_mm_mask_expandloadu_epi16`]
+  * [_] [`_mm_maskz_expandloadu_epi16`]
+  * [_] [`_mm256_mask_expandloadu_epi16`]
+  * [_] [`_mm256_maskz_expandloadu_epi16`]
+  * [_] [`_mm512_mask_expandloadu_epi16`]
+  * [_] [`_mm512_maskz_expandloadu_epi16`]
+  * [_] [`_mm_mask_expandloadu_epi8`]
+  * [_] [`_mm_maskz_expandloadu_epi8`]
+  * [_] [`_mm256_mask_expandloadu_epi8`]
+  * [_] [`_mm256_maskz_expandloadu_epi8`]
+  * [_] [`_mm512_mask_expandloadu_epi8`]
+  * [_] [`_mm512_maskz_expandloadu_epi8`]
+  * [x] [`_mm_mask_shldi_epi16`]
+  * [x] [`_mm_maskz_shldi_epi16`]
+  * [x] [`_mm_shldi_epi16`]
+  * [x] [`_mm256_mask_shldi_epi16`]
+  * [x] [`_mm256_maskz_shldi_epi16`]
+  * [x] [`_mm256_shldi_epi16`]
+  * [x] [`_mm512_mask_shldi_epi16`]
+  * [x] [`_mm512_maskz_shldi_epi16`]
+  * [x] [`_mm512_shldi_epi16`]
+  * [x] [`_mm_mask_shldi_epi32`]
+  * [x] [`_mm_maskz_shldi_epi32`]
+  * [x] [`_mm_shldi_epi32`]
+  * [x] [`_mm256_mask_shldi_epi32`]
+  * [x] [`_mm256_maskz_shldi_epi32`]
+  * [x] [`_mm256_shldi_epi32`]
+  * [x] [`_mm512_mask_shldi_epi32`]
+  * [x] [`_mm512_maskz_shldi_epi32`]
+  * [x] [`_mm512_shldi_epi32`]
+  * [x] [`_mm_mask_shldi_epi64`]
+  * [x] [`_mm_maskz_shldi_epi64`]
+  * [x] [`_mm_shldi_epi64`]
+  * [x] [`_mm256_mask_shldi_epi64`]
+  * [x] [`_mm256_maskz_shldi_epi64`]
+  * [x] [`_mm256_shldi_epi64`]
+  * [x] [`_mm512_mask_shldi_epi64`]
+  * [x] [`_mm512_maskz_shldi_epi64`]
+  * [x] [`_mm512_shldi_epi64`]
+  * [x] [`_mm_mask_shldv_epi16`]
+  * [x] [`_mm_maskz_shldv_epi16`]
+  * [x] [`_mm_shldv_epi16`]
+  * [x] [`_mm256_mask_shldv_epi16`]
+  * [x] [`_mm256_maskz_shldv_epi16`]
+  * [x] [`_mm256_shldv_epi16`]
+  * [x] [`_mm512_mask_shldv_epi16`]
+  * [x] [`_mm512_maskz_shldv_epi16`]
+  * [x] [`_mm512_shldv_epi16`]
+  * [x] [`_mm_mask_shldv_epi32`]
+  * [x] [`_mm_maskz_shldv_epi32`]
+  * [x] [`_mm_shldv_epi32`]
+  * [x] [`_mm256_mask_shldv_epi32`]
+  * [x] [`_mm256_maskz_shldv_epi32`]
+  * [x] [`_mm256_shldv_epi32`]
+  * [x] [`_mm512_mask_shldv_epi32`]
+  * [x] [`_mm512_maskz_shldv_epi32`]
+  * [x] [`_mm512_shldv_epi32`]
+  * [x] [`_mm_mask_shldv_epi64`]
+  * [x] [`_mm_maskz_shldv_epi64`]
+  * [x] [`_mm_shldv_epi64`]
+  * [x] [`_mm256_mask_shldv_epi64`]
+  * [x] [`_mm256_maskz_shldv_epi64`]
+  * [x] [`_mm256_shldv_epi64`]
+  * [x] [`_mm512_mask_shldv_epi64`]
+  * [x] [`_mm512_maskz_shldv_epi64`]
+  * [x] [`_mm512_shldv_epi64`]
+  * [x] [`_mm_mask_shrdi_epi16`]
+  * [x] [`_mm_maskz_shrdi_epi16`]
+  * [x] [`_mm_shrdi_epi16`]
+  * [x] [`_mm256_mask_shrdi_epi16`]
+  * [x] [`_mm256_maskz_shrdi_epi16`]
+  * [x] [`_mm256_shrdi_epi16`]
+  * [x] [`_mm512_mask_shrdi_epi16`]
+  * [x] [`_mm512_maskz_shrdi_epi16`]
+  * [x] [`_mm512_shrdi_epi16`]
+  * [x] [`_mm_mask_shrdi_epi32`]
+  * [x] [`_mm_maskz_shrdi_epi32`]
+  * [x] [`_mm_shrdi_epi32`]
+  * [x] [`_mm256_mask_shrdi_epi32`]
+  * [x] [`_mm256_maskz_shrdi_epi32`]
+  * [x] [`_mm256_shrdi_epi32`]
+  * [x] [`_mm512_mask_shrdi_epi32`]
+  * [x] [`_mm512_maskz_shrdi_epi32`]
+  * [x] [`_mm512_shrdi_epi32`]
+  * [x] [`_mm_mask_shrdi_epi64`]
+  * [x] [`_mm_maskz_shrdi_epi64`]
+  * [x] [`_mm_shrdi_epi64`]
+  * [x] [`_mm256_mask_shrdi_epi64`]
+  * [x] [`_mm256_maskz_shrdi_epi64`]
+  * [x] [`_mm256_shrdi_epi64`]
+  * [x] [`_mm512_mask_shrdi_epi64`]
+  * [x] [`_mm512_maskz_shrdi_epi64`]
+  * [x] [`_mm512_shrdi_epi64`]
+  * [x] [`_mm_mask_shrdv_epi16`]
+  * [x] [`_mm_maskz_shrdv_epi16`]
+  * [x] [`_mm_shrdv_epi16`]
+  * [x] [`_mm256_mask_shrdv_epi16`]
+  * [x] [`_mm256_maskz_shrdv_epi16`]
+  * [x] [`_mm256_shrdv_epi16`]
+  * [x] [`_mm512_mask_shrdv_epi16`]
+  * [x] [`_mm512_maskz_shrdv_epi16`]
+  * [x] [`_mm512_shrdv_epi16`]
+  * [x] [`_mm_mask_shrdv_epi32`]
+  * [x] [`_mm_maskz_shrdv_epi32`]
+  * [x] [`_mm_shrdv_epi32`]
+  * [x] [`_mm256_mask_shrdv_epi32`]
+  * [x] [`_mm256_maskz_shrdv_epi32`]
+  * [x] [`_mm256_shrdv_epi32`]
+  * [x] [`_mm512_mask_shrdv_epi32`]
+  * [x] [`_mm512_maskz_shrdv_epi32`]
+  * [x] [`_mm512_shrdv_epi32`]
+  * [x] [`_mm_mask_shrdv_epi64`]
+  * [x] [`_mm_maskz_shrdv_epi64`]
+  * [x] [`_mm_shrdv_epi64`]
+  * [x] [`_mm256_mask_shrdv_epi64`]
+  * [x] [`_mm256_maskz_shrdv_epi64`]
+  * [x] [`_mm256_shrdv_epi64`]
+  * [x] [`_mm512_mask_shrdv_epi64`]
+  * [x] [`_mm512_maskz_shrdv_epi64`]
+  * [x] [`_mm512_shrdv_epi64`]
+
+</p>
diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs
index 78091daa91..032bce9176 100644
--- a/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/crates/core_arch/src/x86/avx512vbmi2.rs
@@ -1416,6 +1416,507 @@ pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i
     transmute(simd_select_bitmask(k, shf, zero))
 }
 
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi64&expand=5114)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvq(
+        a.as_i64x8(),
+        b.as_i64x8(),
+        _mm512_set1_epi64(imm8 as i64).as_i64x8(),
+    ))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi64&expand=5112)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shrdi_epi64(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+    imm8: i32,
+) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x8 = vpshrdvq(
+        a.as_i64x8(),
+        b.as_i64x8(),
+        _mm512_set1_epi64(imm8 as i64).as_i64x8(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi64&expand=5113)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 255))] //should be vpshrdq
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x8 = vpshrdvq(
+        a.as_i64x8(),
+        b.as_i64x8(),
+        _mm512_set1_epi64(imm8 as i64).as_i64x8(),
+    );
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi64&expand=5111)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvq256(
+        a.as_i64x4(),
+        b.as_i64x4(),
+        _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
+    ))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi64&expand=5109)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shrdi_epi64(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x4 = vpshrdvq256(
+        a.as_i64x4(),
+        b.as_i64x4(),
+        _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi64&expand=5110)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x4 = vpshrdvq256(
+        a.as_i64x4(),
+        b.as_i64x4(),
+        _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
+    );
+    let zero = _mm256_setzero_si256().as_i64x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi64&expand=5108)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvq128(
+        a.as_i64x2(),
+        b.as_i64x2(),
+        _mm_set1_epi64x(imm8 as i64).as_i64x2(),
+    ))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src" when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi64&expand=5106)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shrdi_epi64(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x2 = vpshrdvq128(
+        a.as_i64x2(),
+        b.as_i64x2(),
+        _mm_set1_epi64x(imm8 as i64).as_i64x2(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
+}
+
+/// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi64&expand=5107)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i64x2 = vpshrdvq128(
+        a.as_i64x2(),
+        b.as_i64x2(),
+        _mm_set1_epi64x(imm8 as i64).as_i64x2(),
+    );
+    let zero = _mm_setzero_si128().as_i64x2();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi32&expand=5105)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvd(
+        a.as_i32x16(),
+        b.as_i32x16(),
+        _mm512_set1_epi32(imm8).as_i32x16(),
+    ))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi32&expand=5103)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shrdi_epi32(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    imm8: i32,
+) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x16 = vpshrdvd(
+        a.as_i32x16(),
+        b.as_i32x16(),
+        _mm512_set1_epi32(imm8).as_i32x16(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi32&expand=5104)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x16 = vpshrdvd(
+        a.as_i32x16(),
+        b.as_i32x16(),
+        _mm512_set1_epi32(imm8).as_i32x16(),
+    );
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi32&expand=5102)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvd256(
+        a.as_i32x8(),
+        b.as_i32x8(),
+        _mm256_set1_epi32(imm8).as_i32x8(),
+    ))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi32&expand=5100)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shrdi_epi32(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x8 = vpshrdvd256(
+        a.as_i32x8(),
+        b.as_i32x8(),
+        _mm256_set1_epi32(imm8).as_i32x8(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi32&expand=5101)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x8 = vpshrdvd256(
+        a.as_i32x8(),
+        b.as_i32x8(),
+        _mm256_set1_epi32(imm8).as_i32x8(),
+    );
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi32&expand=5099)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvd128(
+        a.as_i32x4(),
+        b.as_i32x4(),
+        _mm_set1_epi32(imm8).as_i32x4(),
+    ))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi32&expand=5097)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shrdi_epi32(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
+    transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
+}
+
+/// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi32&expand=5098)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi16&expand=5096)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvw(
+        a.as_i16x32(),
+        b.as_i16x32(),
+        _mm512_set1_epi16(imm8 as i16).as_i16x32(),
+    ))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi16&expand=5094)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shrdi_epi16(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+    imm8: i32,
+) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i16x32 = vpshrdvw(
+        a.as_i16x32(),
+        b.as_i16x32(),
+        _mm512_set1_epi16(imm8 as i16).as_i16x32(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi16&expand=5095)
+#[inline]
+#[target_feature(enable = "avx512vbmi2")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i16x32 = vpshrdvw(
+        a.as_i16x32(),
+        b.as_i16x32(),
+        _mm512_set1_epi16(imm8 as i16).as_i16x32(),
+    );
+    let zero = _mm512_setzero_si512().as_i16x32();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi16&expand=5093)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    transmute(vpshrdvw256(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        _mm256_set1_epi16(imm8 as i16).as_i16x16(),
+    ))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi16&expand=5091)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shrdi_epi16(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let shf: i16x16 = vpshrdvw256(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        _mm256_set1_epi16(imm8 as i16).as_i16x16(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi16&expand=5092)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    let shf: i16x16 = vpshrdvw256(
+        a.as_i16x16(),
+        b.as_i16x16(),
+        _mm256_set1_epi16(imm8 as i16).as_i16x16(),
+    );
+    let zero = _mm256_setzero_si256().as_i16x16();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi16&expand=5090)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    transmute(vpshrdvw128(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        _mm_set1_epi16(imm8 as i16).as_i16x8(),
+    ))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi16&expand=5088)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shrdi_epi16(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    let shf: i16x8 = vpshrdvw128(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        _mm_set1_epi16(imm8 as i16).as_i16x8(),
+    );
+    transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
+}
+
+/// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi16&expand=5089)
+#[inline]
+#[target_feature(enable = "avx512vbmi2,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    let shf: i16x8 = vpshrdvw128(
+        a.as_i16x8(),
+        b.as_i16x8(),
+        _mm_set1_epi16(imm8 as i16).as_i16x8(),
+    );
+    let zero = _mm_setzero_si128().as_i16x8();
+    transmute(simd_select_bitmask(k, shf, zero))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
@@ -2694,311 +3195,283 @@ mod tests {
         let e = _mm_set1_epi16(6);
         assert_eq_m128i(r, e);
     }
-    /*
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_shrdv_epi64() {
-            let a = _mm512_set1_epi64(8);
-            let b = _mm512_set1_epi64(2);
-            let c = _mm512_set1_epi64(1);
-            let r = _mm512_shrdv_epi64(a, b, c);
-            let e = _mm512_set1_epi64(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_mask_shrdv_epi64() {
-            let a = _mm512_set1_epi64(8);
-            let b = _mm512_set1_epi64(2);
-            let c = _mm512_set1_epi64(1);
-            let r = _mm512_mask_shrdv_epi64(a, 0, b, c);
-            assert_eq_m512i(r, a);
-            let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c);
-            let e = _mm512_set1_epi64(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_maskz_shrdv_epi64() {
-            let a = _mm512_set1_epi64(8);
-            let b = _mm512_set1_epi64(2);
-            let c = _mm512_set1_epi64(1);
-            let r = _mm512_maskz_shrdv_epi64(0, a, b, c);
-            assert_eq_m512i(r, _mm512_setzero_si512());
-            let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c);
-            let e = _mm512_set1_epi64(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_shrdv_epi64() {
-            let a = _mm256_set1_epi64x(8);
-            let b = _mm256_set1_epi64x(2);
-            let c = _mm256_set1_epi64x(1);
-            let r = _mm256_shrdv_epi64(a, b, c);
-            let e = _mm256_set1_epi64x(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_mask_shrdv_epi64() {
-            let a = _mm256_set1_epi64x(8);
-            let b = _mm256_set1_epi64x(2);
-            let c = _mm256_set1_epi64x(1);
-            let r = _mm256_mask_shrdv_epi64(a, 0, b, c);
-            assert_eq_m256i(r, a);
-            let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c);
-            let e = _mm256_set1_epi64x(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_maskz_shrdv_epi64() {
-            let a = _mm256_set1_epi64x(8);
-            let b = _mm256_set1_epi64x(2);
-            let c = _mm256_set1_epi64x(1);
-            let r = _mm256_maskz_shrdv_epi64(0, a, b, c);
-            assert_eq_m256i(r, _mm256_setzero_si256());
-            let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c);
-            let e = _mm256_set1_epi64x(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_shrdv_epi64() {
-            let a = _mm_set1_epi64x(8);
-            let b = _mm_set1_epi64x(2);
-            let c = _mm_set1_epi64x(1);
-            let r = _mm_shrdv_epi64(a, b, c);
-            let e = _mm_set1_epi64x(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_mask_shrdv_epi64() {
-            let a = _mm_set1_epi64x(8);
-            let b = _mm_set1_epi64x(2);
-            let c = _mm_set1_epi64x(1);
-            let r = _mm_mask_shrdv_epi64(a, 0, b, c);
-            assert_eq_m128i(r, a);
-            let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c);
-            let e = _mm_set1_epi64x(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_maskz_shrdv_epi64() {
-            let a = _mm_set1_epi64x(8);
-            let b = _mm_set1_epi64x(2);
-            let c = _mm_set1_epi64x(1);
-            let r = _mm_maskz_shrdv_epi64(0, a, b, c);
-            assert_eq_m128i(r, _mm_setzero_si128());
-            let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c);
-            let e = _mm_set1_epi64x(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_shrdv_epi32() {
-            let a = _mm512_set1_epi32(8);
-            let b = _mm512_set1_epi32(2);
-            let c = _mm512_set1_epi32(1);
-            let r = _mm512_shrdv_epi32(a, b, c);
-            let e = _mm512_set1_epi32(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_mask_shrdv_epi32() {
-            let a = _mm512_set1_epi32(8);
-            let b = _mm512_set1_epi32(2);
-            let c = _mm512_set1_epi32(1);
-            let r = _mm512_mask_shrdv_epi32(a, 0, b, c);
-            assert_eq_m512i(r, a);
-            let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c);
-            let e = _mm512_set1_epi32(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_maskz_shrdv_epi32() {
-            let a = _mm512_set1_epi32(8);
-            let b = _mm512_set1_epi32(2);
-            let c = _mm512_set1_epi32(1);
-            let r = _mm512_maskz_shrdv_epi32(0, a, b, c);
-            assert_eq_m512i(r, _mm512_setzero_si512());
-            let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c);
-            let e = _mm512_set1_epi32(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_shrdv_epi32() {
-            let a = _mm256_set1_epi32(8);
-            let b = _mm256_set1_epi32(2);
-            let c = _mm256_set1_epi32(1);
-            let r = _mm256_shrdv_epi32(a, b, c);
-            let e = _mm256_set1_epi32(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_mask_shrdv_epi32() {
-            let a = _mm256_set1_epi32(8);
-            let b = _mm256_set1_epi32(2);
-            let c = _mm256_set1_epi32(1);
-            let r = _mm256_mask_shrdv_epi32(a, 0, b, c);
-            assert_eq_m256i(r, a);
-            let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c);
-            let e = _mm256_set1_epi32(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_maskz_shrdv_epi32() {
-            let a = _mm256_set1_epi32(8);
-            let b = _mm256_set1_epi32(2);
-            let c = _mm256_set1_epi32(1);
-            let r = _mm256_maskz_shrdv_epi32(0, a, b, c);
-            assert_eq_m256i(r, _mm256_setzero_si256());
-            let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c);
-            let e = _mm256_set1_epi32(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_shrdv_epi32() {
-            let a = _mm_set1_epi32(8);
-            let b = _mm_set1_epi32(2);
-            let c = _mm_set1_epi32(1);
-            let r = _mm_shrdv_epi32(a, b, c);
-            let e = _mm_set1_epi32(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_mask_shrdv_epi32() {
-            let a = _mm_set1_epi32(8);
-            let b = _mm_set1_epi32(2);
-            let c = _mm_set1_epi32(1);
-            let r = _mm_mask_shrdv_epi32(a, 0, b, c);
-            assert_eq_m128i(r, a);
-            let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c);
-            let e = _mm_set1_epi32(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_maskz_shrdv_epi32() {
-            let a = _mm_set1_epi32(8);
-            let b = _mm_set1_epi32(2);
-            let c = _mm_set1_epi32(1);
-            let r = _mm_maskz_shrdv_epi32(0, a, b, c);
-            assert_eq_m128i(r, _mm_setzero_si128());
-            let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c);
-            let e = _mm_set1_epi32(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_shrdv_epi16() {
-            let a = _mm512_set1_epi16(8);
-            let b = _mm512_set1_epi16(2);
-            let c = _mm512_set1_epi16(1);
-            let r = _mm512_shrdv_epi16(a, b, c);
-            let e = _mm512_set1_epi16(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_mask_shrdv_epi16() {
-            let a = _mm512_set1_epi16(8);
-            let b = _mm512_set1_epi16(2);
-            let c = _mm512_set1_epi16(1);
-            let r = _mm512_mask_shrdv_epi16(a, 0, b, c);
-            assert_eq_m512i(r, a);
-            let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
-            let e = _mm512_set1_epi16(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2")]
-        unsafe fn test_mm512_maskz_shrdv_epi16() {
-            let a = _mm512_set1_epi16(8);
-            let b = _mm512_set1_epi16(2);
-            let c = _mm512_set1_epi16(1);
-            let r = _mm512_maskz_shrdv_epi16(0, a, b, c);
-            assert_eq_m512i(r, _mm512_setzero_si512());
-            let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
-            let e = _mm512_set1_epi16(1);
-            assert_eq_m512i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_shrdv_epi16() {
-            let a = _mm256_set1_epi16(8);
-            let b = _mm256_set1_epi16(2);
-            let c = _mm256_set1_epi16(1);
-            let r = _mm256_shrdv_epi16(a, b, c);
-            let e = _mm256_set1_epi16(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_mask_shrdv_epi16() {
-            let a = _mm256_set1_epi16(8);
-            let b = _mm256_set1_epi16(2);
-            let c = _mm256_set1_epi16(1);
-            let r = _mm256_mask_shrdv_epi16(a, 0, b, c);
-            assert_eq_m256i(r, a);
-            let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c);
-            let e = _mm256_set1_epi16(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm256_maskz_shrdv_epi16() {
-            let a = _mm256_set1_epi16(8);
-            let b = _mm256_set1_epi16(2);
-            let c = _mm256_set1_epi16(1);
-            let r = _mm256_maskz_shrdv_epi16(0, a, b, c);
-            assert_eq_m256i(r, _mm256_setzero_si256());
-            let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c);
-            let e = _mm256_set1_epi16(1);
-            assert_eq_m256i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_shrdv_epi16() {
-            let a = _mm_set1_epi16(8);
-            let b = _mm_set1_epi16(2);
-            let c = _mm_set1_epi16(1);
-            let r = _mm_shrdv_epi16(a, b, c);
-            let e = _mm_set1_epi16(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_mask_shrdv_epi16() {
-            let a = _mm_set1_epi16(8);
-            let b = _mm_set1_epi16(2);
-            let c = _mm_set1_epi16(1);
-            let r = _mm_mask_shrdv_epi16(a, 0, b, c);
-            assert_eq_m128i(r, a);
-            let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c);
-            let e = _mm_set1_epi16(1);
-            assert_eq_m128i(r, e);
-        }
-
-        #[simd_test(enable = "avx512vbmi2,avx512vl")]
-        unsafe fn test_mm_maskz_shrdv_epi16() {
-            let a = _mm_set1_epi16(8);
-            let b = _mm_set1_epi16(2);
-            let c = _mm_set1_epi16(1);
-            let r = _mm_maskz_shrdv_epi16(0, a, b, c);
-            assert_eq_m128i(r, _mm_setzero_si128());
-            let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c);
-            let e = _mm_set1_epi16(1);
-            assert_eq_m128i(r, e);
-        }
-    */
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shrdi_epi64() {
+        let a = _mm512_set1_epi64(8);
+        let b = _mm512_set1_epi64(2);
+        let r = _mm512_shrdi_epi64(a, b, 1);
+        let e = _mm512_set1_epi64(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shrdi_epi64() {
+        let a = _mm512_set1_epi64(8);
+        let b = _mm512_set1_epi64(2);
+        let r = _mm512_mask_shrdi_epi64(a, 0, a, b, 1);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shrdi_epi64(a, 0b11111111, a, b, 1);
+        let e = _mm512_set1_epi64(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shrdi_epi64() {
+        let a = _mm512_set1_epi64(8);
+        let b = _mm512_set1_epi64(2);
+        let r = _mm512_maskz_shrdi_epi64(0, a, b, 1);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shrdi_epi64(0b11111111, a, b, 1);
+        let e = _mm512_set1_epi64(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shrdi_epi64() {
+        let a = _mm256_set1_epi64x(8);
+        let b = _mm256_set1_epi64x(2);
+        let r = _mm256_shrdi_epi64(a, b, 1);
+        let e = _mm256_set1_epi64x(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shrdi_epi64() {
+        let a = _mm256_set1_epi64x(8);
+        let b = _mm256_set1_epi64x(2);
+        let r = _mm256_mask_shrdi_epi64(a, 0, a, b, 1);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shrdi_epi64(a, 0b00001111, a, b, 1);
+        let e = _mm256_set1_epi64x(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shrdi_epi64() {
+        let a = _mm256_set1_epi64x(8);
+        let b = _mm256_set1_epi64x(2);
+        let r = _mm256_maskz_shrdi_epi64(0, a, b, 1);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shrdi_epi64(0b00001111, a, b, 1);
+        let e = _mm256_set1_epi64x(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shrdi_epi64() {
+        let a = _mm_set1_epi64x(8);
+        let b = _mm_set1_epi64x(2);
+        let r = _mm_shrdi_epi64(a, b, 1);
+        let e = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shrdi_epi64() {
+        let a = _mm_set1_epi64x(8);
+        let b = _mm_set1_epi64x(2);
+        let r = _mm_mask_shrdi_epi64(a, 0, a, b, 1);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shrdi_epi64(a, 0b00000011, a, b, 1);
+        let e = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shrdi_epi64() {
+        let a = _mm_set1_epi64x(8);
+        let b = _mm_set1_epi64x(2);
+        let r = _mm_maskz_shrdi_epi64(0, a, b, 1);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shrdi_epi64(0b00000011, a, b, 1);
+        let e = _mm_set1_epi64x(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shrdi_epi32() {
+        let a = _mm512_set1_epi32(8);
+        let b = _mm512_set1_epi32(2);
+        let r = _mm512_shrdi_epi32(a, b, 1);
+        let e = _mm512_set1_epi32(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shrdi_epi32() {
+        let a = _mm512_set1_epi32(8);
+        let b = _mm512_set1_epi32(2);
+        let r = _mm512_mask_shrdi_epi32(a, 0, a, b, 1);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shrdi_epi32(a, 0b11111111_11111111, a, b, 1);
+        let e = _mm512_set1_epi32(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shrdi_epi32() {
+        let a = _mm512_set1_epi32(8);
+        let b = _mm512_set1_epi32(2);
+        let r = _mm512_maskz_shrdi_epi32(0, a, b, 1);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shrdi_epi32(0b11111111_11111111, a, b, 1);
+        let e = _mm512_set1_epi32(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shrdi_epi32() {
+        let a = _mm256_set1_epi32(8);
+        let b = _mm256_set1_epi32(2);
+        let r = _mm256_shrdi_epi32(a, b, 1);
+        let e = _mm256_set1_epi32(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shrdi_epi32() {
+        let a = _mm256_set1_epi32(8);
+        let b = _mm256_set1_epi32(2);
+        let r = _mm256_mask_shrdi_epi32(a, 0, a, b, 1);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shrdi_epi32(a, 0b11111111, a, b, 1);
+        let e = _mm256_set1_epi32(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shrdi_epi32() {
+        let a = _mm256_set1_epi32(8);
+        let b = _mm256_set1_epi32(2);
+        let r = _mm256_maskz_shrdi_epi32(0, a, b, 1);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shrdi_epi32(0b11111111, a, b, 1);
+        let e = _mm256_set1_epi32(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shrdi_epi32() {
+        let a = _mm_set1_epi32(8);
+        let b = _mm_set1_epi32(2);
+        let r = _mm_shrdi_epi32(a, b, 1);
+        let e = _mm_set1_epi32(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shrdi_epi32() {
+        let a = _mm_set1_epi32(8);
+        let b = _mm_set1_epi32(2);
+        let r = _mm_mask_shrdi_epi32(a, 0, a, b, 1);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shrdi_epi32(a, 0b00001111, a, b, 1);
+        let e = _mm_set1_epi32(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shrdi_epi32() {
+        let a = _mm_set1_epi32(8);
+        let b = _mm_set1_epi32(2);
+        let r = _mm_maskz_shrdi_epi32(0, a, b, 1);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shrdi_epi32(0b00001111, a, b, 1);
+        let e = _mm_set1_epi32(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_shrdi_epi16() {
+        let a = _mm512_set1_epi16(8);
+        let b = _mm512_set1_epi16(2);
+        let r = _mm512_shrdi_epi16(a, b, 1);
+        let e = _mm512_set1_epi16(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_mask_shrdi_epi16() {
+        let a = _mm512_set1_epi16(8);
+        let b = _mm512_set1_epi16(2);
+        let r = _mm512_mask_shrdi_epi16(a, 0, a, b, 1);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_shrdi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 1);
+        let e = _mm512_set1_epi16(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2")]
+    unsafe fn test_mm512_maskz_shrdi_epi16() {
+        let a = _mm512_set1_epi16(8);
+        let b = _mm512_set1_epi16(2);
+        let r = _mm512_maskz_shrdi_epi16(0, a, b, 1);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_shrdi_epi16(0b11111111_11111111_11111111_11111111, a, b, 1);
+        let e = _mm512_set1_epi16(1);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_shrdi_epi16() {
+        let a = _mm256_set1_epi16(8);
+        let b = _mm256_set1_epi16(2);
+        let r = _mm256_shrdi_epi16(a, b, 1);
+        let e = _mm256_set1_epi16(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_mask_shrdi_epi16() {
+        let a = _mm256_set1_epi16(8);
+        let b = _mm256_set1_epi16(2);
+        let r = _mm256_mask_shrdi_epi16(a, 0, a, b, 1);
+        assert_eq_m256i(r, a);
+        let r = _mm256_mask_shrdi_epi16(a, 0b11111111_11111111, a, b, 1);
+        let e = _mm256_set1_epi16(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm256_maskz_shrdi_epi16() {
+        let a = _mm256_set1_epi16(8);
+        let b = _mm256_set1_epi16(2);
+        let r = _mm256_maskz_shrdi_epi16(0, a, b, 1);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_shrdi_epi16(0b11111111_11111111, a, b, 1);
+        let e = _mm256_set1_epi16(1);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_shrdi_epi16() {
+        let a = _mm_set1_epi16(8);
+        let b = _mm_set1_epi16(2);
+        let r = _mm_shrdi_epi16(a, b, 1);
+        let e = _mm_set1_epi16(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_mask_shrdi_epi16() {
+        let a = _mm_set1_epi16(8);
+        let b = _mm_set1_epi16(2);
+        let r = _mm_mask_shrdi_epi16(a, 0, a, b, 1);
+        assert_eq_m128i(r, a);
+        let r = _mm_mask_shrdi_epi16(a, 0b11111111, a, b, 1);
+        let e = _mm_set1_epi16(1);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vbmi2,avx512vl")]
+    unsafe fn test_mm_maskz_shrdi_epi16() {
+        let a = _mm_set1_epi16(8);
+        let b = _mm_set1_epi16(2);
+        let r = _mm_maskz_shrdi_epi16(0, a, b, 1);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_shrdi_epi16(0b11111111, a, b, 1);
+        let e = _mm_set1_epi16(1);
+        assert_eq_m128i(r, e);
+    }
 }

From 0b42b6eb4c6aae70e99a2f074991f6d934aed4cc Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Mon, 11 Jan 2021 14:35:48 +0000
Subject: [PATCH 08/10] test1

---
 crates/core_arch/avx512vbmi2.md               |   1 -
 crates/core_arch/src/x86/avx512vnni.rs        | 849 ++++++++++++++++++
 .../core_arch/src/x86/avx512vp2intersect.rs   |  43 +
 3 files changed, 892 insertions(+), 1 deletion(-)
 create mode 100644 crates/core_arch/src/x86/avx512vnni.rs
 create mode 100644 crates/core_arch/src/x86/avx512vp2intersect.rs

diff --git a/crates/core_arch/avx512vbmi2.md b/crates/core_arch/avx512vbmi2.md
index 43d497108e..4bb6a0ed0c 100644
--- a/crates/core_arch/avx512vbmi2.md
+++ b/crates/core_arch/avx512vbmi2.md
@@ -150,5 +150,4 @@
   * [x] [`_mm512_mask_shrdv_epi64`]
   * [x] [`_mm512_maskz_shrdv_epi64`]
   * [x] [`_mm512_shrdv_epi64`]
-
 </p>
diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs
new file mode 100644
index 0000000000..daa3c896a6
--- /dev/null
+++ b/crates/core_arch/src/x86/avx512vnni.rs
@@ -0,0 +1,849 @@
+use crate::{
+    core_arch::{simd::*, simd_llvm::*, x86::*},
+    mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssd_epi32&expand=2219)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssd_epi32&expand=2220)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssd_epi32&expand=2221)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_epi32&expand=2216)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssd_epi32&expand=2217)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
+    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssd_epi32&expand=2218)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_epi32&expand=2213)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssd_epi32&expand=2214)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
+    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssd_epi32&expand=2215)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssd))]
+pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssds_epi32&expand=2228)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssds_epi32&expand=2229)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssds_epi32&expand=2230)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_epi32&expand=2225)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssds_epi32&expand=2226)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
+    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssds_epi32&expand=2227)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_epi32&expand=2222)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssds_epi32&expand=2223)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
+    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssds_epi32&expand=2224)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpwssds))]
+pub unsafe fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusd_epi32&expand=2201)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusd_epi32&expand=2202)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusd_epi32&expand=2203)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_epi32&expand=2198)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusd_epi32&expand=2199)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
+    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusd_epi32&expand=2200)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_epi32&expand=2195)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusd_epi32&expand=2196)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
+    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusd_epi32&expand=2197)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusd))]
+pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusds_epi32&expand=2210)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusds_epi32&expand=2211)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusds_epi32&expand=2212)
+#[inline]
+#[target_feature(enable = "avx512vnni")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_epi32&expand=2207)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusds_epi32&expand=2208)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
+    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusds_epi32&expand=2209)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_epi32&expand=2204)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusds_epi32&expand=2205)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
+    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusds_epi32&expand=2206)
+#[inline]
+#[target_feature(enable = "avx512vnni,avx512vl")]
+#[cfg_attr(test, assert_instr(vpdpbusds))]
+pub unsafe fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512.vpdpwssd.512"]
+    fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    #[link_name = "llvm.x86.avx512.vpdpwssd.256"]
+    fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx512.vpdpwssd.128"]
+    fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.avx512.vpdpwssds.512"]
+    fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    #[link_name = "llvm.x86.avx512.vpdpwssds.256"]
+    fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx512.vpdpwssds.128"]
+    fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.avx512.vpdpbusd.512"]
+    fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    #[link_name = "llvm.x86.avx512.vpdpbusd.256"]
+    fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx512.vpdpbusd.128"]
+    fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.avx512.vpdpbusds.512"]
+    fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
+    #[link_name = "llvm.x86.avx512.vpdpbusds.256"]
+    fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx512.vpdpbusds.128"]
+    fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+}
+
+#[cfg(test)]
+mod tests {
+
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_dpwssd_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<16|1<<0);
+        let b = _mm512_set1_epi32(1<<16|1<<0);
+        let r = _mm512_dpwssd_epi32(src, a, b);
+        let e = _mm512_set1_epi32(3);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_mask_dpwssd_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<16|1<<0);
+        let b = _mm512_set1_epi32(1<<16|1<<0);
+        let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b);
+        let e = _mm512_set1_epi32(3);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_maskz_dpwssd_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<16|1<<0);
+        let b = _mm512_set1_epi32(1<<16|1<<0);
+        let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b);
+        let e = _mm512_set1_epi32(3);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_dpwssd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<16|1<<0);
+        let b = _mm256_set1_epi32(1<<16|1<<0);
+        let r = _mm256_dpwssd_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_mask_dpwssd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<16|1<<0);
+        let b = _mm256_set1_epi32(1<<16|1<<0);
+        let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b);
+        assert_eq_m256i(r, src);
+        let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_maskz_dpwssd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<16|1<<0);
+        let b = _mm256_set1_epi32(1<<16|1<<0);
+        let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_dpwssd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<16|1<<0);
+        let b = _mm_set1_epi32(1<<16|1<<0);
+        let r = _mm_dpwssd_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_mask_dpwssd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<16|1<<0);
+        let b = _mm_set1_epi32(1<<16|1<<0);
+        let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b);
+        assert_eq_m128i(r, src);
+        let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_maskz_dpwssd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<16|1<<0);
+        let b = _mm_set1_epi32(1<<16|1<<0);
+        let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_dpwssds_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<16|1<<0);
+        let b = _mm512_set1_epi32(1<<16|1<<0);
+        let r = _mm512_dpwssds_epi32(src, a, b);
+        let e = _mm512_set1_epi32(3);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_mask_dpwssds_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<16|1<<0);
+        let b = _mm512_set1_epi32(1<<16|1<<0);
+        let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b);
+        let e = _mm512_set1_epi32(3);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_maskz_dpwssds_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<16|1<<0);
+        let b = _mm512_set1_epi32(1<<16|1<<0);
+        let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b);
+        let e = _mm512_set1_epi32(3);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_dpwssds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<16|1<<0);
+        let b = _mm256_set1_epi32(1<<16|1<<0);
+        let r = _mm256_dpwssds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_mask_dpwssds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<16|1<<0);
+        let b = _mm256_set1_epi32(1<<16|1<<0);
+        let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b);
+        assert_eq_m256i(r, src);
+        let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_maskz_dpwssds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<16|1<<0);
+        let b = _mm256_set1_epi32(1<<16|1<<0);
+        let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_dpwssds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<16|1<<0);
+        let b = _mm_set1_epi32(1<<16|1<<0);
+        let r = _mm_dpwssds_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_mask_dpwssds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<16|1<<0);
+        let b = _mm_set1_epi32(1<<16|1<<0);
+        let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b);
+        assert_eq_m128i(r, src);
+        let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_maskz_dpwssds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<16|1<<0);
+        let b = _mm_set1_epi32(1<<16|1<<0);
+        let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_dpbusd_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm512_dpbusd_epi32(src, a, b);
+        let e = _mm512_set1_epi32(5);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_mask_dpbusd_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b);
+        let e = _mm512_set1_epi32(5);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_maskz_dpbusd_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b);
+        let e = _mm512_set1_epi32(5);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_dpbusd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm256_dpbusd_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_mask_dpbusd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b);
+        assert_eq_m256i(r, src);
+        let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_maskz_dpbusd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_dpbusd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm_dpbusd_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_mask_dpbusd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b);
+        assert_eq_m128i(r, src);
+        let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_maskz_dpbusd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_dpbusds_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm512_dpbusds_epi32(src, a, b);
+        let e = _mm512_set1_epi32(5);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_mask_dpbusds_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b);
+        let e = _mm512_set1_epi32(5);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni")]
+    unsafe fn test_mm512_maskz_dpbusds_epi32() {
+        let src = _mm512_set1_epi32(1);
+        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b);
+        let e = _mm512_set1_epi32(5);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_dpbusds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm256_dpbusds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_mask_dpbusds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b);
+        assert_eq_m256i(r, src);
+        let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm256_maskz_dpbusds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_dpbusds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm_dpbusds_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_mask_dpbusds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b);
+        assert_eq_m128i(r, src);
+        let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512vnni,avx512vl")]
+    unsafe fn test_mm_maskz_dpbusds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
+        let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+}
diff --git a/crates/core_arch/src/x86/avx512vp2intersect.rs b/crates/core_arch/src/x86/avx512vp2intersect.rs
new file mode 100644
index 0000000000..211f0b25dc
--- /dev/null
+++ b/crates/core_arch/src/x86/avx512vp2intersect.rs
@@ -0,0 +1,43 @@
+use crate::{
+    core_arch::{simd::*, /*simd_llvm::*,*/ x86::*},
+    mem::transmute,
+};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Compute intersection of packed 32-bit integer vectors a and b, and store indication of match in the corresponding bit of two mask registers specified by k1 and k2. A match in corresponding elements of a and b is indicated by a set bit in the corresponding bit of the mask registers.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_2intersect_epi32&expand=2)
+#[inline]
+#[target_feature(enable = "avx512vp2intersect,avx512f")]
+#[cfg_attr(test, assert_instr(vp2intersectd))]
+pub unsafe fn _mm512_2intersect_epi32(a: __m512i, b: __m512i, k1: *mut u16, k2: *mut u16) {
+    transmute(vp2intersectd(a.as_i32x16(), b.as_i32x16(), k1, k2))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.avx512.vp2intersect.d.512"]
+    fn vp2intersectd(a: i32x16, b: i32x16, k1: *mut u16, k2: *mut u16);
+}
+
+#[cfg(test)]
+mod tests {
+
+    use crate::core_arch::x86::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx512vp2intersect,avx512f")]
+    unsafe fn test_mm512_2intersect_epi32() {
+        let a = _mm512_set1_epi32(1);
+        let b = _mm512_set1_epi32(1);
+        let mut r1: u16 = 0;
+        let mut r2: u16 = 0;
+        _mm512_2intersect_epi32(a, b, &mut r1 as *mut _ as *mut u16, &mut r2 as *mut _ as *mut u16);
+        //assert_eq!(r1, 0b11111111_11111111);
+        //assert_eq!(r2, 0b11111111_11111111);
+        assert_eq!(r1, 0);
+        assert_eq!(r2, 0);
+    }
+}

From 3ee3e64ce16cc1461e290e95682a3403df938457 Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Mon, 11 Jan 2021 14:36:28 +0000
Subject: [PATCH 09/10] test1

---
 crates/core_arch/src/x86/avx512vnni.rs        | 849 ------------------
 .../core_arch/src/x86/avx512vp2intersect.rs   |  43 -
 2 files changed, 892 deletions(-)
 delete mode 100644 crates/core_arch/src/x86/avx512vnni.rs
 delete mode 100644 crates/core_arch/src/x86/avx512vp2intersect.rs

diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs
deleted file mode 100644
index daa3c896a6..0000000000
--- a/crates/core_arch/src/x86/avx512vnni.rs
+++ /dev/null
@@ -1,849 +0,0 @@
-use crate::{
-    core_arch::{simd::*, simd_llvm::*, x86::*},
-    mem::transmute,
-};
-
-#[cfg(test)]
-use stdarch_test::assert_instr;
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssd_epi32&expand=2219)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssd_epi32&expand=2220)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssd_epi32&expand=2221)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_epi32&expand=2216)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssd_epi32&expand=2217)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssd_epi32&expand=2218)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_epi32&expand=2213)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssd_epi32&expand=2214)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssd_epi32&expand=2215)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssd))]
-pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpwssds_epi32&expand=2228)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpwssds_epi32&expand=2229)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpwssds_epi32&expand=2230)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_epi32&expand=2225)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpwssds_epi32&expand=2226)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpwssds_epi32&expand=2227)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_epi32&expand=2222)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpwssds_epi32&expand=2223)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpwssds_epi32&expand=2224)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpwssds))]
-pub unsafe fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusd_epi32&expand=2201)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusd_epi32&expand=2202)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusd_epi32&expand=2203)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_epi32&expand=2198)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusd_epi32&expand=2199)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusd_epi32&expand=2200)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_epi32&expand=2195)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusd_epi32&expand=2196)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusd_epi32&expand=2197)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusd))]
-pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dpbusds_epi32&expand=2210)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dpbusds_epi32&expand=2211)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dpbusds_epi32&expand=2212)
-#[inline]
-#[target_feature(enable = "avx512vnni")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
-    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_epi32&expand=2207)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dpbusds_epi32&expand=2208)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dpbusds_epi32&expand=2209)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
-    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_epi32&expand=2204)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dpbusds_epi32&expand=2205)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dpbusds_epi32&expand=2206)
-#[inline]
-#[target_feature(enable = "avx512vnni,avx512vl")]
-#[cfg_attr(test, assert_instr(vpdpbusds))]
-pub unsafe fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
-    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
-}
-
-#[allow(improper_ctypes)]
-extern "C" {
-    #[link_name = "llvm.x86.avx512.vpdpwssd.512"]
-    fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.vpdpwssd.256"]
-    fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx512.vpdpwssd.128"]
-    fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.vpdpwssds.512"]
-    fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.vpdpwssds.256"]
-    fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx512.vpdpwssds.128"]
-    fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.vpdpbusd.512"]
-    fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.vpdpbusd.256"]
-    fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx512.vpdpbusd.128"]
-    fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
-
-    #[link_name = "llvm.x86.avx512.vpdpbusds.512"]
-    fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.vpdpbusds.256"]
-    fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx512.vpdpbusds.128"]
-    fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
-}
-
-#[cfg(test)]
-mod tests {
-
-    use crate::core_arch::x86::*;
-    use stdarch_test::simd_test;
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_dpwssd_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<16|1<<0);
-        let b = _mm512_set1_epi32(1<<16|1<<0);
-        let r = _mm512_dpwssd_epi32(src, a, b);
-        let e = _mm512_set1_epi32(3);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_mask_dpwssd_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<16|1<<0);
-        let b = _mm512_set1_epi32(1<<16|1<<0);
-        let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b);
-        assert_eq_m512i(r, src);
-        let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b);
-        let e = _mm512_set1_epi32(3);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_maskz_dpwssd_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<16|1<<0);
-        let b = _mm512_set1_epi32(1<<16|1<<0);
-        let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b);
-        assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b);
-        let e = _mm512_set1_epi32(3);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_dpwssd_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<16|1<<0);
-        let b = _mm256_set1_epi32(1<<16|1<<0);
-        let r = _mm256_dpwssd_epi32(src, a, b);
-        let e = _mm256_set1_epi32(3);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_mask_dpwssd_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<16|1<<0);
-        let b = _mm256_set1_epi32(1<<16|1<<0);
-        let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b);
-        assert_eq_m256i(r, src);
-        let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b);
-        let e = _mm256_set1_epi32(3);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_maskz_dpwssd_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<16|1<<0);
-        let b = _mm256_set1_epi32(1<<16|1<<0);
-        let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b);
-        assert_eq_m256i(r, _mm256_setzero_si256());
-        let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b);
-        let e = _mm256_set1_epi32(3);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_dpwssd_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<16|1<<0);
-        let b = _mm_set1_epi32(1<<16|1<<0);
-        let r = _mm_dpwssd_epi32(src, a, b);
-        let e = _mm_set1_epi32(3);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_mask_dpwssd_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<16|1<<0);
-        let b = _mm_set1_epi32(1<<16|1<<0);
-        let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b);
-        assert_eq_m128i(r, src);
-        let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b);
-        let e = _mm_set1_epi32(3);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_maskz_dpwssd_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<16|1<<0);
-        let b = _mm_set1_epi32(1<<16|1<<0);
-        let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b);
-        assert_eq_m128i(r, _mm_setzero_si128());
-        let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b);
-        let e = _mm_set1_epi32(3);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_dpwssds_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<16|1<<0);
-        let b = _mm512_set1_epi32(1<<16|1<<0);
-        let r = _mm512_dpwssds_epi32(src, a, b);
-        let e = _mm512_set1_epi32(3);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_mask_dpwssds_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<16|1<<0);
-        let b = _mm512_set1_epi32(1<<16|1<<0);
-        let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b);
-        assert_eq_m512i(r, src);
-        let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b);
-        let e = _mm512_set1_epi32(3);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_maskz_dpwssds_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<16|1<<0);
-        let b = _mm512_set1_epi32(1<<16|1<<0);
-        let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b);
-        assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b);
-        let e = _mm512_set1_epi32(3);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_dpwssds_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<16|1<<0);
-        let b = _mm256_set1_epi32(1<<16|1<<0);
-        let r = _mm256_dpwssds_epi32(src, a, b);
-        let e = _mm256_set1_epi32(3);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_mask_dpwssds_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<16|1<<0);
-        let b = _mm256_set1_epi32(1<<16|1<<0);
-        let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b);
-        assert_eq_m256i(r, src);
-        let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b);
-        let e = _mm256_set1_epi32(3);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_maskz_dpwssds_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<16|1<<0);
-        let b = _mm256_set1_epi32(1<<16|1<<0);
-        let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b);
-        assert_eq_m256i(r, _mm256_setzero_si256());
-        let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b);
-        let e = _mm256_set1_epi32(3);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_dpwssds_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<16|1<<0);
-        let b = _mm_set1_epi32(1<<16|1<<0);
-        let r = _mm_dpwssds_epi32(src, a, b);
-        let e = _mm_set1_epi32(3);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_mask_dpwssds_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<16|1<<0);
-        let b = _mm_set1_epi32(1<<16|1<<0);
-        let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b);
-        assert_eq_m128i(r, src);
-        let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b);
-        let e = _mm_set1_epi32(3);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_maskz_dpwssds_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<16|1<<0);
-        let b = _mm_set1_epi32(1<<16|1<<0);
-        let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b);
-        assert_eq_m128i(r, _mm_setzero_si128());
-        let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b);
-        let e = _mm_set1_epi32(3);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_dpbusd_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm512_dpbusd_epi32(src, a, b);
-        let e = _mm512_set1_epi32(5);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_mask_dpbusd_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b);
-        assert_eq_m512i(r, src);
-        let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b);
-        let e = _mm512_set1_epi32(5);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_maskz_dpbusd_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b);
-        assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b);
-        let e = _mm512_set1_epi32(5);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_dpbusd_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm256_dpbusd_epi32(src, a, b);
-        let e = _mm256_set1_epi32(5);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_mask_dpbusd_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b);
-        assert_eq_m256i(r, src);
-        let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b);
-        let e = _mm256_set1_epi32(5);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_maskz_dpbusd_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b);
-        assert_eq_m256i(r, _mm256_setzero_si256());
-        let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b);
-        let e = _mm256_set1_epi32(5);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_dpbusd_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm_dpbusd_epi32(src, a, b);
-        let e = _mm_set1_epi32(5);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_mask_dpbusd_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b);
-        assert_eq_m128i(r, src);
-        let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b);
-        let e = _mm_set1_epi32(5);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_maskz_dpbusd_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b);
-        assert_eq_m128i(r, _mm_setzero_si128());
-        let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b);
-        let e = _mm_set1_epi32(5);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_dpbusds_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm512_dpbusds_epi32(src, a, b);
-        let e = _mm512_set1_epi32(5);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_mask_dpbusds_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b);
-        assert_eq_m512i(r, src);
-        let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b);
-        let e = _mm512_set1_epi32(5);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni")]
-    unsafe fn test_mm512_maskz_dpbusds_epi32() {
-        let src = _mm512_set1_epi32(1);
-        let a = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm512_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b);
-        assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b);
-        let e = _mm512_set1_epi32(5);
-        assert_eq_m512i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_dpbusds_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm256_dpbusds_epi32(src, a, b);
-        let e = _mm256_set1_epi32(5);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_mask_dpbusds_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b);
-        assert_eq_m256i(r, src);
-        let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b);
-        let e = _mm256_set1_epi32(5);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm256_maskz_dpbusds_epi32() {
-        let src = _mm256_set1_epi32(1);
-        let a = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm256_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b);
-        assert_eq_m256i(r, _mm256_setzero_si256());
-        let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b);
-        let e = _mm256_set1_epi32(5);
-        assert_eq_m256i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_dpbusds_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm_dpbusds_epi32(src, a, b);
-        let e = _mm_set1_epi32(5);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_mask_dpbusds_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b);
-        assert_eq_m128i(r, src);
-        let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b);
-        let e = _mm_set1_epi32(5);
-        assert_eq_m128i(r, e);
-    }
-
-    #[simd_test(enable = "avx512vnni,avx512vl")]
-    unsafe fn test_mm_maskz_dpbusds_epi32() {
-        let src = _mm_set1_epi32(1);
-        let a = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let b = _mm_set1_epi32(1<<24|1<<16|1<<8|1<<0);
-        let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b);
-        assert_eq_m128i(r, _mm_setzero_si128());
-        let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b);
-        let e = _mm_set1_epi32(5);
-        assert_eq_m128i(r, e);
-    }
-}
diff --git a/crates/core_arch/src/x86/avx512vp2intersect.rs b/crates/core_arch/src/x86/avx512vp2intersect.rs
deleted file mode 100644
index 211f0b25dc..0000000000
--- a/crates/core_arch/src/x86/avx512vp2intersect.rs
+++ /dev/null
@@ -1,43 +0,0 @@
-use crate::{
-    core_arch::{simd::*, /*simd_llvm::*,*/ x86::*},
-    mem::transmute,
-};
-
-#[cfg(test)]
-use stdarch_test::assert_instr;
-
-/// Compute intersection of packed 32-bit integer vectors a and b, and store indication of match in the corresponding bit of two mask registers specified by k1 and k2. A match in corresponding elements of a and b is indicated by a set bit in the corresponding bit of the mask registers.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_2intersect_epi32&expand=2)
-#[inline]
-#[target_feature(enable = "avx512vp2intersect,avx512f")]
-#[cfg_attr(test, assert_instr(vp2intersectd))]
-pub unsafe fn _mm512_2intersect_epi32(a: __m512i, b: __m512i, k1: *mut u16, k2: *mut u16) {
-    transmute(vp2intersectd(a.as_i32x16(), b.as_i32x16(), k1, k2))
-}
-
-#[allow(improper_ctypes)]
-extern "C" {
-    #[link_name = "llvm.x86.avx512.vp2intersect.d.512"]
-    fn vp2intersectd(a: i32x16, b: i32x16, k1: *mut u16, k2: *mut u16);
-}
-
-#[cfg(test)]
-mod tests {
-
-    use crate::core_arch::x86::*;
-    use stdarch_test::simd_test;
-
-    #[simd_test(enable = "avx512vp2intersect,avx512f")]
-    unsafe fn test_mm512_2intersect_epi32() {
-        let a = _mm512_set1_epi32(1);
-        let b = _mm512_set1_epi32(1);
-        let mut r1: u16 = 0;
-        let mut r2: u16 = 0;
-        _mm512_2intersect_epi32(a, b, &mut r1 as *mut _ as *mut u16, &mut r2 as *mut _ as *mut u16);
-        //assert_eq!(r1, 0b11111111_11111111);
-        //assert_eq!(r2, 0b11111111_11111111);
-        assert_eq!(r1, 0);
-        assert_eq!(r2, 0);
-    }
-}

From fd3a1aea9af28bb51156855b0303ee12f9a11c53 Mon Sep 17 00:00:00 2001
From: jirong <jironglin@gmail.com>
Date: Mon, 11 Jan 2021 15:54:51 +0000
Subject: [PATCH 10/10] remove wasm32 ci

---
 .github/workflows/main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fdae25c903..ee75f8f022 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -77,7 +77,7 @@ jobs:
         - mips64-unknown-linux-gnuabi64
         - mips64el-unknown-linux-gnuabi64
         - s390x-unknown-linux-gnu
-        - wasm32-wasi
+        #- wasm32-wasi
         - i586-unknown-linux-gnu
         - x86_64-linux-android
         - arm-linux-androideabi
@@ -130,8 +130,8 @@ jobs:
           disable_assert_instr: true
         - target: s390x-unknown-linux-gnu
           os: ubuntu-latest
-        - target: wasm32-wasi
-          os: ubuntu-latest
+        #- target: wasm32-wasi
+        #  os: ubuntu-latest
         - target: aarch64-unknown-linux-gnu
           os: ubuntu-latest
         - target: x86_64-apple-darwin