diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs
index da04b70342..878c8957f1 100644
--- a/crates/core_arch/src/x86/avx512bf16.rs
+++ b/crates/core_arch/src/x86/avx512bf16.rs
@@ -30,8 +30,8 @@ extern "C" {
     fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16;
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors 
-/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a 
+/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
 /// 128-bit wide vector.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh)
 #[inline]
@@ -41,9 +41,9 @@ pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
     transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two vectors 
-/// a and b to packed BF16 (16-bit) floating-point elements, and store the results 
-/// in single vector dst using writemask k (elements are copied from src when the 
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in single vector dst using writemask k (elements are copied from src when the
 /// corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh)
 #[inline]
@@ -54,279 +54,294 @@ pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __
     transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two vectors 
-/// a and b to packed BF16 (16-bit) floating-point elements, and store the results 
-/// in single vector dst using zeromask k (elements are zeroed out when the corresponding 
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
 /// mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_maskz_cvtne2ps_pbh (k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
     let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
     let zero = _mm_setzero_si128().as_u16x8();
     transmute(simd_select_bitmask(k, cvt, zero))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors 
-/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a 
+/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a
 /// 256-bit wide vector.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm256_cvtne2ps_pbh (a: __m256, b: __m256) -> __m256bh {
+pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh {
     transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b 
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
 /// to packed BF16 (16-bit) floating-point elements and and store the results in single vector
 /// dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm256_mask_cvtne2ps_pbh (src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh {
+pub unsafe fn _mm256_mask_cvtne2ps_pbh(
+    src: __m256bh,
+    k: __mmask16,
+    a: __m256,
+    b: __m256,
+) -> __m256bh {
     let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
     transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b
-/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector 
+/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector
 /// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm256_maskz_cvtne2ps_pbh (k: __mmask16, a: __m256, b: __m256) -> __m256bh {
+pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
     let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
     let zero = _mm256_setzero_si256().as_u16x16();
     transmute(simd_select_bitmask(k, cvt, zero))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors 
+/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
 /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a  
 /// 512-bit wide vector.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm512_cvtne2ps_pbh (a: __m512, b: __m512) -> __m512bh {
+pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh {
     transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two vectors 
-/// a and b to packed BF16 (16-bit) floating-point elements, and store the results 
-/// in single vector dst using writemask k (elements are copied from src when the 
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in single vector dst using writemask k (elements are copied from src when the
 /// corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm512_mask_cvtne2ps_pbh (src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh {
+pub unsafe fn _mm512_mask_cvtne2ps_pbh(
+    src: __m512bh,
+    k: __mmask32,
+    a: __m512,
+    b: __m512,
+) -> __m512bh {
     let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
     transmute(simd_select_bitmask(k, cvt, src.as_u16x32()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in two vectors 
-/// a and b to packed BF16 (16-bit) floating-point elements, and store the results 
-/// in single vector dst using zeromask k (elements are zeroed out when the corresponding 
+/// Convert packed single-precision (32-bit) floating-point elements in two vectors
+/// a and b to packed BF16 (16-bit) floating-point elements, and store the results
+/// in single vector dst using zeromask k (elements are zeroed out when the corresponding
 /// mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm512_maskz_cvtne2ps_pbh (k: __mmask32, a: __m512, b: __m512) -> __m512bh {
+pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
     let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
     let zero = _mm512_setzero_si512().as_u16x32();
     transmute(simd_select_bitmask(k, cvt, zero))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) 
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
 /// floating-point elements, and store the results in dst.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm256_cvtneps_pbh (a: __m256) -> __m128bh {
+pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh {
     transmute(cvtneps2bf16_256(a.as_f32x8()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) 
-/// floating-point elements, and store the results in dst using writemask k 
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using writemask k
 /// (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm256_mask_cvtneps_pbh (src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
+pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh {
     let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
     transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) 
-/// floating-point elements, and store the results in dst using zeromask k 
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k
 /// (elements are zeroed out when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm256_maskz_cvtneps_pbh (k: __mmask8, a: __m256) -> __m128bh {
+pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
     let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
     let zero = _mm_setzero_si128().as_u16x8();
     transmute(simd_select_bitmask(k, cvt, zero))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) 
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
 /// floating-point elements, and store the results in dst.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm512_cvtneps_pbh (a: __m512) -> __m256bh {
+pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh {
     transmute(cvtneps2bf16_512(a.as_f32x16()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) 
-/// floating-point elements, and store the results in dst using writemask k 
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using writemask k
 /// (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm512_mask_cvtneps_pbh (src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
+pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh {
     let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
     transmute(simd_select_bitmask(k, cvt, src.as_u16x16()))
 }
 
-/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) 
-/// floating-point elements, and store the results in dst using zeromask k 
+/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
+/// floating-point elements, and store the results in dst using zeromask k
 /// (elements are zeroed out when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
-pub unsafe fn _mm512_maskz_cvtneps_pbh (k: __mmask16, a: __m512) -> __m256bh {
+pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
     let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
     let zero = _mm256_setzero_si256().as_u16x16();
     transmute(simd_select_bitmask(k, cvt, zero))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
 /// with elements in src, and store the results in dst.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm_dpbf16_ps (src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
+pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
     transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4()))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst using writemask k 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
 /// (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm_mask_dpbf16_ps (src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
+pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 {
     let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
     transmute(simd_select_bitmask(k, rst, src.as_f32x4()))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst using zeromask k 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
 /// (elements are zeroed out when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm_maskz_dpbf16_ps (k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
+pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 {
     let rst = _mm_dpbf16_ps(src, a, b).as_f32x4();
     let zero = _mm_set1_ps(0.0_f32).as_f32x4();
     transmute(simd_select_bitmask(k, rst, zero))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
 /// with elements in src, and store the results in dst.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm256_dpbf16_ps (src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
     transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8()))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst using writemask k 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
 /// (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm256_mask_dpbf16_ps (src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
+pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 {
     let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
     transmute(simd_select_bitmask(k, rst, src.as_f32x8()))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst using zeromask k 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
 /// (elements are zeroed out when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm256_maskz_dpbf16_ps (k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
+pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
     let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
     let zero = _mm256_setzero_ps().as_f32x8();
     transmute(simd_select_bitmask(k, rst, zero))
 }
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) 
-/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit)
+/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit)
 /// floating-point elements with elements in src, and store the results in dst.
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm512_dpbf16_ps (src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
+pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
     transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16()))
- }
+}
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst using writemask k 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using writemask k
 /// (elements are copied from src when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm512_mask_dpbf16_ps (src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
+pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 {
     let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
     transmute(simd_select_bitmask(k, rst, src.as_f32x16()))
- }
+}
 
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, 
-/// accumulating the intermediate single-precision (32-bit) floating-point elements 
-/// with elements in src, and store the results in dst using zeromask k 
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
+/// accumulating the intermediate single-precision (32-bit) floating-point elements
+/// with elements in src, and store the results in dst using zeromask k
 /// (elements are zeroed out when the corresponding mask bit is not set).
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps)
 #[inline]
 #[target_feature(enable = "avx512bf16,avx512f")]
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
-pub unsafe fn _mm512_maskz_dpbf16_ps (k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 {
+pub unsafe fn _mm512_maskz_dpbf16_ps(
+    k: __mmask16,
+    src: __m512,
+    a: __m512bh,
+    b: __m512bh,
+) -> __m512 {
     let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
     let zero = _mm512_setzero_ps().as_f32x16();
     transmute(simd_select_bitmask(k, rst, zero))
- }
+}
 
 #[cfg(test)]
 mod tests {
@@ -342,18 +357,32 @@ mod tests {
         let c: __m128bh = _mm_cvtne2ps_pbh(a, b);
         let result: [u16; 8] = transmute(c.as_u16x8());
         let expected_result: [u16; 8] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
-    unsafe fn test_mm_mask_cvtne2ps_pbh(){
+    unsafe fn test_mm_mask_cvtne2ps_pbh() {
         let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
         let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
         let src_array: [u16; 8] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+        ];
         let src: __m128bh = transmute(src_array);
         let a: __m128 = transmute(a_array);
         let b: __m128 = transmute(b_array);
@@ -361,8 +390,15 @@ mod tests {
         let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b);
         let result: [u16; 8] = transmute(c.as_u16x8());
         let expected_result: [u16; 8] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+        ];
         assert_eq!(result, expected_result);
         let k = 0b0000_0000;
         let c = _mm_mask_cvtne2ps_pbh(src, k, a, b);
@@ -372,7 +408,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
-    unsafe fn test_mm_maskz_cvtne2ps_pbh(){
+    unsafe fn test_mm_maskz_cvtne2ps_pbh() {
         let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32];
         let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32];
         let a: __m128 = transmute(a_array);
@@ -381,51 +417,119 @@ mod tests {
         let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b);
         let result: [u16; 8] = transmute(c.as_u16x8());
         let expected_result: [u16; 8] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+        ];
         assert_eq!(result, expected_result);
         let k = 0b0011_1100;
         let c = _mm_maskz_cvtne2ps_pbh(k, a, b);
         let result: [u16; 8] = transmute(c.as_u16x8());
         let expected_result: [u16; 8] = [
-            0, 0, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0, 0];
+            0,
+            0,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0,
+            0,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_cvtne2ps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let b_array = [
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32];
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+        ];
         let a: __m256 = transmute(a_array);
         let b: __m256 = transmute(b_array);
         let c: __m256bh = _mm256_cvtne2ps_pbh(a, b);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_mask_cvtne2ps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let b_array = [
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32];
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+        ];
         let src_array: [u16; 16] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+        ];
         let src: __m256bh = transmute(src_array);
         let a: __m256 = transmute(a_array);
         let b: __m256 = transmute(b_array);
@@ -433,10 +537,23 @@ mod tests {
         let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0;
         let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b);
@@ -448,82 +565,224 @@ mod tests {
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_maskz_cvtne2ps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let b_array = [
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32];
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+        ];
         let a: __m256 = transmute(a_array);
         let b: __m256 = transmute(b_array);
         let k: __mmask16 = 0xffff;
         let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0b0110_1100_0011_0110;
         let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0, 0b1_10000010_0101000, 0b1_10000000_1110000, 0,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0, 0,
-            0, 0, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0, 0b0_10000110_1111111, 0b0_10001000_1111010, 0];
+            0,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0,
+            0,
+            0,
+            0,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_cvtne2ps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32,
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let b_array = [
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32,
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32];
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+        ];
         let a: __m512 = transmute(a_array);
         let b: __m512 = transmute(b_array);
         let c: __m512bh = _mm512_cvtne2ps_pbh(a, b);
         let result: [u16; 32] = transmute(c.as_u16x32());
         let expected_result: [u16; 32] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_mask_cvtne2ps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32,
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let b_array = [
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32,
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32];
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+        ];
         let src_array: [u16; 32] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+        ];
         let src: __m512bh = transmute(src_array);
         let a: __m512 = transmute(a_array);
         let b: __m512 = transmute(b_array);
@@ -531,14 +790,39 @@ mod tests {
         let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
         let result: [u16; 32] = transmute(c.as_u16x32());
         let expected_result: [u16; 32] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask32 = 0;
         let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b);
@@ -550,143 +834,334 @@ mod tests {
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_maskz_cvtne2ps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32,
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let b_array = [
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32,
-            -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32,
-            -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32];
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+            -178.125_f32,
+            -10.5_f32,
+            -3.75_f32,
+            -50.25_f32,
+            -16.5_f32,
+            -255.11_f32,
+            -1000.158_f32,
+            -575.575_f32,
+        ];
         let a: __m512 = transmute(a_array);
         let b: __m512 = transmute(b_array);
         let k: __mmask32 = 0xffffffff;
         let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
         let result: [u16; 32] = transmute(c.as_u16x32());
         let expected_result: [u16; 32] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110;
         let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b);
         let result: [u16; 32] = transmute(c.as_u16x32());
         let expected_result: [u16; 32] = [
-            0, 0b1_10000010_0101000, 0b1_10000000_1110000, 0,
-            0b1_10000011_0000100, 0, 0b1_10001000_1111010, 0,
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0, 0,
-            0, 0b1_10000110_1111111, 0, 0b1_10001000_0010000,
-            0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0,
-            0b0_10000011_0000100, 0, 0, 0b0_10001000_0010000,
-            0, 0b0_10000010_0101000, 0, 0b0_10000100_1001001,
-            0, 0, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0,
+            0b1_10000011_0000100,
+            0,
+            0b1_10001000_1111010,
+            0,
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0,
+            0,
+            0,
+            0b1_10000110_1111111,
+            0,
+            0b1_10001000_0010000,
+            0,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0,
+            0b0_10000011_0000100,
+            0,
+            0,
+            0b0_10001000_0010000,
+            0,
+            0b0_10000010_0101000,
+            0,
+            0b0_10000100_1001001,
+            0,
+            0,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_cvtneps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let a: __m256 = transmute(a_array);
         let c: __m128bh = _mm256_cvtneps_pbh(a);
         let result: [u16; 8] = transmute(c.as_u16x8());
         let expected_result: [u16; 8] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_mask_cvtneps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let src_array: [u16; 8] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+        ];
         let src: __m128bh = transmute(src_array);
         let a: __m256 = transmute(a_array);
         let k: __mmask8 = 0xff;
         let b = _mm256_mask_cvtneps_pbh(src, k, a);
         let result: [u16; 8] = transmute(b.as_u16x8());
         let expected_result: [u16; 8] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0x0;
-        let b: __m128bh = _mm256_mask_cvtneps_pbh (src, k, a);
+        let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a);
         let result: [u16; 8] = transmute(b.as_u16x8());
         let expected_result: [u16; 8] = src_array;
         assert_eq!(result, expected_result);
     }
-    
+
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_maskz_cvtneps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let a: __m256 = transmute(a_array);
         let k: __mmask8 = 0xff;
         let b = _mm256_maskz_cvtneps_pbh(k, a);
         let result: [u16; 8] = transmute(b.as_u16x8());
         let expected_result: [u16; 8] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0x6;
-        let b: __m128bh = _mm256_maskz_cvtneps_pbh (k, a);
+        let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a);
         let result: [u16; 8] = transmute(b.as_u16x8());
-        let expected_result: [u16; 8] = [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
+        let expected_result: [u16; 8] =
+            [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_cvtneps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32,
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let a: __m512 = transmute(a_array);
         let c: __m256bh = _mm512_cvtneps_pbh(a);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_mask_cvtneps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32,
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let src_array: [u16; 16] = [
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000,
-            0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001,
-            0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000];
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+            0b1_10000110_0110010,
+            0b1_10000010_0101000,
+            0b1_10000000_1110000,
+            0b1_10000100_1001001,
+            0b1_10000011_0000100,
+            0b1_10000110_1111111,
+            0b1_10001000_1111010,
+            0b1_10001000_0010000,
+        ];
         let src: __m256bh = transmute(src_array);
         let a: __m512 = transmute(a_array);
         let k: __mmask16 = 0xffff;
         let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0;
         let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a);
@@ -698,28 +1173,67 @@ mod tests {
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_maskz_cvtneps_pbh() {
         let a_array = [
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32,
-            178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32];
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+            178.125_f32,
+            10.5_f32,
+            3.75_f32,
+            50.25_f32,
+            16.5_f32,
+            255.11_f32,
+            1000.158_f32,
+            575.575_f32,
+        ];
         let a: __m512 = transmute(a_array);
         let k: __mmask16 = 0xffff;
         let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000,
-            0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000];
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+            0b0_10000110_0110010,
+            0b0_10000010_0101000,
+            0b0_10000000_1110000,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0b0_10001000_0010000,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0x653a;
         let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a);
         let result: [u16; 16] = transmute(c.as_u16x16());
         let expected_result: [u16; 16] = [
-            0, 0b0_10000010_0101000, 0, 0b0_10000100_1001001,
-            0b0_10000011_0000100, 0b0_10000110_1111111, 0, 0,
-            0b0_10000110_0110010, 0, 0b0_10000000_1110000, 0,
-            0, 0b0_10000110_1111111, 0b0_10001000_1111010, 0];
+            0,
+            0b0_10000010_0101000,
+            0,
+            0b0_10000100_1001001,
+            0b0_10000011_0000100,
+            0b0_10000110_1111111,
+            0,
+            0,
+            0b0_10000110_0110010,
+            0,
+            0b0_10000000_1110000,
+            0,
+            0,
+            0b0_10000110_1111111,
+            0b0_10001000_1111010,
+            0,
+        ];
         assert_eq!(result, expected_result);
     }
 
@@ -732,7 +1246,7 @@ mod tests {
         let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
         let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
         let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
-        let c: __m128 = _mm_dpbf16_ps (src, a, b);
+        let c: __m128 = _mm_dpbf16_ps(src, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
         assert_eq!(result, expected_result);
@@ -748,17 +1262,17 @@ mod tests {
         let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
         let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
         let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
-        let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b);
+        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0xff;
-        let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b);
+        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0;
-        let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b);
+        let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
         assert_eq!(result, expected_result);
@@ -774,17 +1288,17 @@ mod tests {
         let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
         let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1);
         let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1);
-        let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0xff;
-        let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0;
-        let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 4] = transmute(c.as_f32x4());
         let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0];
         assert_eq!(result, expected_result);
@@ -793,95 +1307,95 @@ mod tests {
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_dpbf16_ps() {
         let a_array = [
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+        ];
         let b_array = [
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+        ];
         let a1: __m256 = transmute(a_array);
         let b1: __m256 = transmute(b_array);
         let src: __m256 = transmute([
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ]);
         let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
         let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
-        let c: __m256 = _mm256_dpbf16_ps (src, a, b);
+        let c: __m256 = _mm256_dpbf16_ps(src, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_mask_dpbf16_ps() {
         let a_array = [
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+        ];
         let b_array = [
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+        ];
         let a1: __m256 = transmute(a_array);
         let b1: __m256 = transmute(b_array);
         let k: __mmask8 = 0x33;
         let src: __m256 = transmute([
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ]);
         let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
         let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
-        let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b);
+        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [
-            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, 
-            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
+            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0xff;
-        let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b);
+        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0;
-        let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b);
+        let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512vl")]
     unsafe fn test_mm256_maskz_dpbf16_ps() {
         let a_array = [
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+        ];
         let b_array = [
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+        ];
         let a1: __m256 = transmute(a_array);
         let b1: __m256 = transmute(b_array);
         let k: __mmask8 = 0x33;
         let src: __m256 = transmute([
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ]);
         let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1);
         let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1);
-        let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [
-            -18.0_f32, -52.0_f32, 0.0, 0.0, 
-            -18.0_f32, -52.0_f32, 0.0, 0.0];
+            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0xff;
-        let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask8 = 0;
-        let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 8] = transmute(c.as_f32x8());
         let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
         assert_eq!(result, expected_result);
@@ -890,129 +1404,114 @@ mod tests {
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_dpbf16_ps() {
         let a_array = [
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+        ];
         let b_array = [
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+        ];
         let a1: __m512 = transmute(a_array);
         let b1: __m512 = transmute(b_array);
-        let src :__m512 = transmute([
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+        let src: __m512 = transmute([
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ]);
         let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
         let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
-        let c: __m512 = _mm512_dpbf16_ps (src, a, b);
+        let c: __m512 = _mm512_dpbf16_ps(src, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_mask_dpbf16_ps() {
         let a_array = [
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+        ];
         let b_array = [
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+        ];
         let a1: __m512 = transmute(a_array);
         let b1: __m512 = transmute(b_array);
         let k: __mmask16 = 0x3333;
         let src: __m512 = transmute([
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ]);
         let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
         let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
-        let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b);
+        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, 
-            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
-            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, 
-            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32];
+            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
+            -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0xffff;
-        let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b);
+        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0;
-        let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b);
+        let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32];
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ];
         assert_eq!(result, expected_result);
     }
 
     #[simd_test(enable = "avx512bf16,avx512f")]
     unsafe fn test_mm512_maskz_dpbf16_ps() {
         let a_array = [
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 
-            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32];
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+            8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32,
+        ];
         let b_array = [
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, 
-            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32];
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+            -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32,
+        ];
         let a1: __m512 = transmute(a_array);
         let b1: __m512 = transmute(b_array);
         let k: __mmask16 = 0x3333;
         let src: __m512 = transmute([
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 
-            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]);
+            1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32,
+            2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32,
+        ]);
         let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1);
         let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1);
         let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            -18.0_f32, -52.0_f32, 0.0, 0.0, 
-            -18.0_f32, -52.0_f32, 0.0, 0.0,
-            -18.0_f32, -52.0_f32, 0.0, 0.0,
-            -18.0_f32, -52.0_f32, 0.0, 0.0];
+            -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32,
+            0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0xffff;
-        let c: __m512 = _mm512_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, 
-            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32];
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+            -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32,
+        ];
         assert_eq!(result, expected_result);
         let k: __mmask16 = 0;
-        let c: __m512 = _mm512_maskz_dpbf16_ps (k, src, a, b);
+        let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b);
         let result: [f32; 16] = transmute(c.as_f32x16());
         let expected_result: [f32; 16] = [
-            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,  
-            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        ];
         assert_eq!(result, expected_result);
     }
-}
\ No newline at end of file
+}
diff --git a/crates/std_detect/src/detect/cache.rs b/crates/std_detect/src/detect/cache.rs
index e79c96dafa..d01a5ea244 100644
--- a/crates/std_detect/src/detect/cache.rs
+++ b/crates/std_detect/src/detect/cache.rs
@@ -3,9 +3,9 @@
 
 #![allow(dead_code)] // not used on all platforms
 
-use crate::sync::atomic::Ordering;
+use core::sync::atomic::Ordering;
 
-use crate::sync::atomic::AtomicUsize;
+use core::sync::atomic::AtomicUsize;
 
 /// Sets the `bit` of `x`.
 #[inline]
@@ -125,9 +125,16 @@ cfg_if::cfg_if! {
     if #[cfg(feature = "std_detect_env_override")] {
         #[inline]
         fn initialize(mut value: Initializer) -> Initializer {
-            if let Ok(disable) = crate::env::var("RUST_STD_DETECT_UNSTABLE") {
-                for v in disable.split(" ") {
-                    let _ = super::Feature::from_str(v).map(|v| value.unset(v as u32));
+            let env = unsafe {
+                libc::getenv(b"RUST_STD_DETECT_UNSTABLE\0".as_ptr() as *const libc::c_char)
+            };
+            if !env.is_null() {
+                let len = unsafe { libc::strlen(env) };
+                let env = unsafe { core::slice::from_raw_parts(env as *const u8, len) };
+                if let Ok(disable) = core::str::from_utf8(env) {
+                    for v in disable.split(" ") {
+                        let _ = super::Feature::from_str(v).map(|v| value.unset(v as u32));
+                    }
                 }
             }
             do_initialize(value);
diff --git a/crates/std_detect/src/detect/mod.rs b/crates/std_detect/src/detect/mod.rs
index 7aedef47d6..1b7768ae8f 100644
--- a/crates/std_detect/src/detect/mod.rs
+++ b/crates/std_detect/src/detect/mod.rs
@@ -98,10 +98,10 @@ cfg_if! {
         // On x86/x86_64 no OS specific functionality is required.
         #[path = "os/x86.rs"]
         mod os;
-    } else if #[cfg(all(target_os = "linux", feature = "use_std"))] {
+    } else if #[cfg(all(target_os = "linux", feature = "libc"))] {
         #[path = "os/linux/mod.rs"]
         mod os;
-    } else if #[cfg(all(target_os = "freebsd", feature = "use_std"))] {
+    } else if #[cfg(all(target_os = "freebsd", feature = "libc"))] {
         #[cfg(target_arch = "aarch64")]
         #[path = "os/aarch64.rs"]
         mod aarch64;
@@ -140,7 +140,7 @@ pub fn features() -> impl Iterator<Item = (&'static str, bool)> {
             target_arch = "mips64",
         ))] {
             (0_u8..Feature::_last as u8).map(|discriminant: u8| {
-                let f: Feature = unsafe { crate::mem::transmute(discriminant) };
+                let f: Feature = unsafe { core::mem::transmute(discriminant) };
                 let name: &'static str = f.to_str();
                 let enabled: bool = check_for(f);
                 (name, enabled)
diff --git a/crates/std_detect/src/detect/os/freebsd/auxvec.rs b/crates/std_detect/src/detect/os/freebsd/auxvec.rs
index c595ec459b..832ce2252e 100644
--- a/crates/std_detect/src/detect/os/freebsd/auxvec.rs
+++ b/crates/std_detect/src/detect/os/freebsd/auxvec.rs
@@ -42,7 +42,7 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
 
 /// Tries to read the `key` from the auxiliary vector.
 fn archauxv(key: usize) -> Result<usize, ()> {
-    use crate::mem;
+    use core::mem;
 
     #[derive(Copy, Clone)]
     #[repr(C)]
diff --git a/crates/std_detect/src/detect/os/freebsd/mod.rs b/crates/std_detect/src/detect/os/freebsd/mod.rs
index 4321bce74d..ade7fb6269 100644
--- a/crates/std_detect/src/detect/os/freebsd/mod.rs
+++ b/crates/std_detect/src/detect/os/freebsd/mod.rs
@@ -5,18 +5,18 @@ mod auxvec;
 cfg_if::cfg_if! {
     if #[cfg(target_arch = "aarch64")] {
         mod aarch64;
-        pub use self::aarch64::check_for;
+        pub(crate) use self::aarch64::detect_features;
     } else if #[cfg(target_arch = "arm")] {
         mod arm;
-        pub use self::arm::check_for;
+        pub(crate) use self::arm::detect_features;
     } else if #[cfg(target_arch = "powerpc64")] {
         mod powerpc;
-        pub use self::powerpc::check_for;
+        pub(crate) use self::powerpc::detect_features;
     } else {
-        use crate::arch::detect::Feature;
+        use crate::detect::cache;
         /// Performs run-time feature detection.
-        pub fn check_for(_x: Feature) -> bool {
-            false
+        pub(crate) fn detect_features() -> cache::Initializer {
+            cache::Initializer::default()
         }
     }
 }
diff --git a/crates/std_detect/src/detect/os/linux/aarch64.rs b/crates/std_detect/src/detect/os/linux/aarch64.rs
index b1b68f763e..80c36e9b99 100644
--- a/crates/std_detect/src/detect/os/linux/aarch64.rs
+++ b/crates/std_detect/src/detect/os/linux/aarch64.rs
@@ -1,6 +1,6 @@
 //! Run-time feature detection for Aarch64 on Linux.
 
-use super::{auxvec, cpuinfo};
+use super::auxvec;
 use crate::detect::{bit, cache, Feature};
 
 /// Try to read the features from the auxiliary vector, and if that fails, try
@@ -10,7 +10,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
         let hwcap: AtHwcap = auxv.into();
         return hwcap.cache();
     }
-    if let Ok(c) = cpuinfo::CpuInfo::new() {
+    #[cfg(feature = "std_detect_file_io")]
+    if let Ok(c) = super::cpuinfo::CpuInfo::new() {
         let hwcap: AtHwcap = c.into();
         return hwcap.cache();
     }
@@ -77,9 +78,10 @@ impl From<auxvec::AuxVec> for AtHwcap {
     }
 }
 
-impl From<cpuinfo::CpuInfo> for AtHwcap {
+#[cfg(feature = "std_detect_file_io")]
+impl From<super::cpuinfo::CpuInfo> for AtHwcap {
     /// Reads AtHwcap from /proc/cpuinfo .
-    fn from(c: cpuinfo::CpuInfo) -> Self {
+    fn from(c: super::cpuinfo::CpuInfo) -> Self {
         let f = &c.field("Features");
         AtHwcap {
             // 64-bit names. FIXME: In 32-bit compatibility mode /proc/cpuinfo will
diff --git a/crates/std_detect/src/detect/os/linux/arm.rs b/crates/std_detect/src/detect/os/linux/arm.rs
index 4b0cb586bb..66cfd05e80 100644
--- a/crates/std_detect/src/detect/os/linux/arm.rs
+++ b/crates/std_detect/src/detect/os/linux/arm.rs
@@ -1,6 +1,6 @@
 //! Run-time feature detection for ARM on Linux.
 
-use super::{auxvec, cpuinfo};
+use super::auxvec;
 use crate::detect::{bit, cache, Feature};
 
 /// Try to read the features from the auxiliary vector, and if that fails, try
@@ -31,7 +31,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
         return value;
     }
 
-    if let Ok(c) = cpuinfo::CpuInfo::new() {
+    #[cfg(feature = "std_detect_file_io")]
+    if let Ok(c) = super::cpuinfo::CpuInfo::new() {
         enable_feature(
             &mut value,
             Feature::neon,
@@ -55,7 +56,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
 /// Is the CPU known to have a broken NEON unit?
 ///
 /// See https://crbug.com/341598.
-fn has_broken_neon(cpuinfo: &cpuinfo::CpuInfo) -> bool {
+#[cfg(feature = "std_detect_file_io")]
+fn has_broken_neon(cpuinfo: &super::cpuinfo::CpuInfo) -> bool {
     cpuinfo.field("CPU implementer") == "0x51"
         && cpuinfo.field("CPU architecture") == "7"
         && cpuinfo.field("CPU variant") == "0x1"
diff --git a/crates/std_detect/src/detect/os/linux/auxvec.rs b/crates/std_detect/src/detect/os/linux/auxvec.rs
index 6ebae67fbf..d556b23b1d 100644
--- a/crates/std_detect/src/detect/os/linux/auxvec.rs
+++ b/crates/std_detect/src/detect/os/linux/auxvec.rs
@@ -1,13 +1,16 @@
 //! Parses ELF auxiliary vectors.
 #![cfg_attr(not(target_arch = "aarch64"), allow(dead_code))]
 
-#[cfg(feature = "std_detect_file_io")]
-use crate::{fs::File, io::Read};
+pub(crate) const AT_NULL: usize = 0;
 
 /// Key to access the CPU Hardware capabilities bitfield.
 pub(crate) const AT_HWCAP: usize = 16;
 /// Key to access the CPU Hardware capabilities 2 bitfield.
-#[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+#[cfg(any(
+    target_arch = "arm",
+    target_arch = "powerpc",
+    target_arch = "powerpc64"
+))]
 pub(crate) const AT_HWCAP2: usize = 26;
 
 /// Cache HWCAP bitfields of the ELF Auxiliary Vector.
@@ -17,7 +20,11 @@ pub(crate) const AT_HWCAP2: usize = 26;
 #[derive(Debug, Copy, Clone)]
 pub(crate) struct AuxVec {
     pub hwcap: usize,
-    #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+    #[cfg(any(
+        target_arch = "arm",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    ))]
     pub hwcap2: usize,
 }
 
@@ -64,7 +71,11 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
             }
 
             // Targets with AT_HWCAP and AT_HWCAP2:
-            #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+            #[cfg(any(
+                target_arch = "arm",
+                target_arch = "powerpc",
+                target_arch = "powerpc64"
+            ))]
             {
                 if let Ok(hwcap2) = getauxval(AT_HWCAP2) {
                     if hwcap != 0 && hwcap2 != 0 {
@@ -74,21 +85,11 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
             }
             drop(hwcap);
         }
-        #[cfg(feature = "std_detect_file_io")]
-        {
-            // If calling getauxval fails, try to read the auxiliary vector from
-            // its file:
-            auxv_from_file("/proc/self/auxv")
-        }
-        #[cfg(not(feature = "std_detect_file_io"))]
-        {
-            Err(())
-        }
     }
 
     #[cfg(not(feature = "std_detect_dlsym_getauxval"))]
     {
-        let hwcap = unsafe { ffi_getauxval(AT_HWCAP) };
+        let hwcap = unsafe { libc::getauxval(AT_HWCAP) };
 
         // Targets with only AT_HWCAP:
         #[cfg(any(target_arch = "aarch64", target_arch = "mips", target_arch = "mips64"))]
@@ -99,14 +100,29 @@ pub(crate) fn auxv() -> Result<AuxVec, ()> {
         }
 
         // Targets with AT_HWCAP and AT_HWCAP2:
-        #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+        #[cfg(any(
+            target_arch = "arm",
+            target_arch = "powerpc",
+            target_arch = "powerpc64"
+        ))]
         {
-            let hwcap2 = unsafe { ffi_getauxval(AT_HWCAP2) };
+            let hwcap2 = unsafe { libc::getauxval(AT_HWCAP2) };
             if hwcap != 0 && hwcap2 != 0 {
                 return Ok(AuxVec { hwcap, hwcap2 });
             }
         }
     }
+
+    #[cfg(feature = "std_detect_file_io")]
+    {
+        // If calling getauxval fails, try to read the auxiliary vector from
+        // its file:
+        auxv_from_file("/proc/self/auxv")
+    }
+    #[cfg(not(feature = "std_detect_file_io"))]
+    {
+        Err(())
+    }
 }
 
 /// Tries to read the `key` from the auxiliary vector by calling the
@@ -122,7 +138,7 @@ fn getauxval(key: usize) -> Result<usize, ()> {
             return Err(());
         }
 
-        let ffi_getauxval: F = mem::transmute(ptr);
+        let ffi_getauxval: F = core::mem::transmute(ptr);
         Ok(ffi_getauxval(key))
     }
 }
@@ -131,7 +147,7 @@ fn getauxval(key: usize) -> Result<usize, ()> {
 /// function returns `Err`.
 #[cfg(feature = "std_detect_file_io")]
 fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
-    let mut file = File::open(file).map_err(|_| ())?;
+    let file = super::read_file(file)?;
 
     // See <https://github.com/torvalds/linux/blob/v3.19/include/uapi/linux/auxvec.h>.
     //
@@ -139,10 +155,11 @@ fn auxv_from_file(file: &str) -> Result<AuxVec, ()> {
     // `AT_EXECFN = 31` to `AT_NULL = 0`. That is, a buffer of
     // 2*32 `usize` elements is enough to read the whole vector.
     let mut buf = [0_usize; 64];
-    {
-        let raw: &mut [u8; 64 * mem::size_of::<usize>()] = unsafe { mem::transmute(&mut buf) };
-        file.read(raw).map_err(|_| ())?;
+    let len = core::mem::size_of_val(&buf).max(file.len());
+    unsafe {
+        core::ptr::copy_nonoverlapping(file.as_ptr(), buf.as_mut_ptr() as *mut u8, len);
     }
+
     auxv_from_buf(&buf)
 }
 
@@ -155,18 +172,24 @@ fn auxv_from_buf(buf: &[usize; 64]) -> Result<AuxVec, ()> {
     {
         for el in buf.chunks(2) {
             match el[0] {
+                AT_NULL => break,
                 AT_HWCAP => return Ok(AuxVec { hwcap: el[1] }),
                 _ => (),
             }
         }
     }
     // Targets with AT_HWCAP and AT_HWCAP2:
-    #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+    #[cfg(any(
+        target_arch = "arm",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    ))]
     {
         let mut hwcap = None;
         let mut hwcap2 = None;
         for el in buf.chunks(2) {
             match el[0] {
+                AT_NULL => break,
                 AT_HWCAP => hwcap = Some(el[1]),
                 AT_HWCAP2 => hwcap2 = Some(el[1]),
                 _ => (),
@@ -214,7 +237,12 @@ mod tests {
 
     // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv
     // does not always contain the AT_HWCAP key under qemu.
-    #[cfg(not(any(target_arch = "mips", target_arch = "mips64", target_arch = "powerpc")))]
+    #[cfg(any(
+        target_arch = "aarch64",
+        target_arch = "arm",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    ))]
     #[test]
     fn auxv_crate() {
         let v = auxv();
@@ -224,7 +252,11 @@ mod tests {
         }
 
         // Targets with AT_HWCAP and AT_HWCAP2:
-        #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+        #[cfg(any(
+            target_arch = "arm",
+            target_arch = "powerpc",
+            target_arch = "powerpc64"
+        ))]
         {
             if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) {
                 let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2;
@@ -243,7 +275,7 @@ mod tests {
     }
 
     #[cfg(feature = "std_detect_file_io")]
-    cfg_if! {
+    cfg_if::cfg_if! {
         if #[cfg(target_arch = "arm")] {
             #[test]
             fn linux_rpi3() {
@@ -264,6 +296,7 @@ mod tests {
                 // want to fall back to /proc/cpuinfo in this case, so
                 // reading should fail. assert_eq!(v.hwcap, 126614527);
                 // assert_eq!(v.hwcap2, 0);
+                let _ = v;
             }
         } else if #[cfg(target_arch = "aarch64")] {
             #[test]
@@ -286,7 +319,14 @@ mod tests {
         }
     }
 
+    #[cfg(any(
+        target_arch = "aarch64",
+        target_arch = "arm",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    ))]
     #[test]
+    #[cfg(feature = "std_detect_file_io")]
     fn auxv_crate_procfs() {
         let v = auxv();
         if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) {
@@ -294,7 +334,11 @@ mod tests {
         }
 
         // Targets with AT_HWCAP and AT_HWCAP2:
-        #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))]
+        #[cfg(any(
+            target_arch = "arm",
+            target_arch = "powerpc",
+            target_arch = "powerpc64"
+        ))]
         {
             if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) {
                 assert_eq!(v.unwrap().hwcap2, hwcap2);
diff --git a/crates/std_detect/src/detect/os/linux/cpuinfo.rs b/crates/std_detect/src/detect/os/linux/cpuinfo.rs
index f76c48a4b1..1f403df01f 100644
--- a/crates/std_detect/src/detect/os/linux/cpuinfo.rs
+++ b/crates/std_detect/src/detect/os/linux/cpuinfo.rs
@@ -1,8 +1,7 @@
 //! Parses /proc/cpuinfo
 #![cfg_attr(not(target_arch = "arm"), allow(dead_code))]
 
-extern crate std;
-use self::std::{fs::File, io, io::Read, prelude::v1::*};
+use alloc::string::String;
 
 /// cpuinfo
 pub(crate) struct CpuInfo {
@@ -11,11 +10,11 @@ pub(crate) struct CpuInfo {
 
 impl CpuInfo {
     /// Reads /proc/cpuinfo into CpuInfo.
-    pub(crate) fn new() -> Result<Self, io::Error> {
-        let mut file = File::open("/proc/cpuinfo")?;
-        let mut cpui = Self { raw: String::new() };
-        file.read_to_string(&mut cpui.raw)?;
-        Ok(cpui)
+    pub(crate) fn new() -> Result<Self, ()> {
+        let raw = super::read_file("/proc/cpuinfo")?;
+        Ok(Self {
+            raw: String::from_utf8(raw).map_err(|_| ())?,
+        })
     }
     /// Returns the value of the cpuinfo `field`.
     pub(crate) fn field(&self, field: &str) -> CpuInfoField {
@@ -34,7 +33,7 @@ impl CpuInfo {
     }
 
     #[cfg(test)]
-    fn from_str(other: &str) -> Result<Self, ::std::io::Error> {
+    fn from_str(other: &str) -> Result<Self, ()> {
         Ok(Self {
             raw: String::from(other),
         })
diff --git a/crates/std_detect/src/detect/os/linux/mod.rs b/crates/std_detect/src/detect/os/linux/mod.rs
index e02d5e6dcd..4b6776e982 100644
--- a/crates/std_detect/src/detect/os/linux/mod.rs
+++ b/crates/std_detect/src/detect/os/linux/mod.rs
@@ -1,28 +1,61 @@
 //! Run-time feature detection on Linux
+//!
+#[cfg(feature = "std_detect_file_io")]
+use alloc::vec::Vec;
 
 mod auxvec;
 
 #[cfg(feature = "std_detect_file_io")]
 mod cpuinfo;
 
-cfg_if! {
+#[cfg(feature = "std_detect_file_io")]
+fn read_file(path: &str) -> Result<Vec<u8>, ()> {
+    let mut path = Vec::from(path.as_bytes());
+    path.push(0);
+
+    unsafe {
+        let file = libc::open(path.as_ptr() as *const libc::c_char, libc::O_RDONLY);
+        if file == -1 {
+            return Err(());
+        }
+
+        let mut data = Vec::new();
+        loop {
+            data.reserve(4096);
+            let spare = data.spare_capacity_mut();
+            match libc::read(file, spare.as_mut_ptr() as *mut _, spare.len()) {
+                -1 => {
+                    libc::close(file);
+                    return Err(());
+                }
+                0 => break,
+                n => data.set_len(data.len() + n as usize),
+            }
+        }
+
+        libc::close(file);
+        Ok(data)
+    }
+}
+
+cfg_if::cfg_if! {
     if #[cfg(target_arch = "aarch64")] {
         mod aarch64;
-        pub use self::aarch64::check_for;
+        pub(crate) use self::aarch64::detect_features;
     } else if #[cfg(target_arch = "arm")] {
         mod arm;
-        pub use self::arm::check_for;
+        pub(crate) use self::arm::detect_features;
     } else  if #[cfg(any(target_arch = "mips", target_arch = "mips64"))] {
         mod mips;
-        pub use self::mips::check_for;
+        pub(crate) use self::mips::detect_features;
     } else if #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] {
         mod powerpc;
-        pub use self::powerpc::check_for;
+        pub(crate) use self::powerpc::detect_features;
     } else {
-        use crate::detect::Feature;
+        use crate::detect::cache;
         /// Performs run-time feature detection.
-        pub fn check_for(_x: Feature) -> bool {
-            false
+        pub(crate) fn detect_features() -> cache::Initializer {
+            cache::Initializer::default()
         }
     }
 }
diff --git a/crates/std_detect/src/detect/os/linux/powerpc.rs b/crates/std_detect/src/detect/os/linux/powerpc.rs
index 97afe49fe5..c3308e8158 100644
--- a/crates/std_detect/src/detect/os/linux/powerpc.rs
+++ b/crates/std_detect/src/detect/os/linux/powerpc.rs
@@ -1,6 +1,6 @@
 //! Run-time feature detection for PowerPC on Linux.
 
-use super::{auxvec, cpuinfo};
+use super::auxvec;
 use crate::detect::{cache, Feature};
 
 /// Try to read the features from the auxiliary vector, and if that fails, try
@@ -27,7 +27,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
 
     // PowerPC's /proc/cpuinfo lacks a proper Feature field,
     // but `altivec` support is indicated in the `cpu` field.
-    if let Ok(c) = cpuinfo::CpuInfo::new() {
+    #[cfg(feature = "std_detect_file_io")]
+    if let Ok(c) = super::cpuinfo::CpuInfo::new() {
         enable_feature(&mut value, Feature::altivec, c.field("cpu").has("altivec"));
         return value;
     }
diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs
index 436fb00f06..388af2e304 100644
--- a/crates/std_detect/src/detect/os/x86.rs
+++ b/crates/std_detect/src/detect/os/x86.rs
@@ -1,11 +1,11 @@
 //! x86 run-time feature detection is OS independent.
 
 #[cfg(target_arch = "x86")]
-use crate::arch::x86::*;
+use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
-use crate::arch::x86_64::*;
+use core::arch::x86_64::*;
 
-use crate::mem;
+use core::mem;
 
 use crate::detect::{bit, cache, Feature};
 
diff --git a/crates/std_detect/src/lib.rs b/crates/std_detect/src/lib.rs
index 8cd02c9616..46cf8fb68d 100644
--- a/crates/std_detect/src/lib.rs
+++ b/crates/std_detect/src/lib.rs
@@ -15,30 +15,17 @@
 #![feature(const_fn, staged_api, stdsimd, doc_cfg, allow_internal_unstable)]
 #![allow(clippy::shadow_reuse)]
 #![deny(clippy::missing_inline_in_public_items)]
-#![cfg_attr(target_os = "linux", feature(linkage))]
 #![cfg_attr(all(target_os = "freebsd", target_arch = "aarch64"), feature(llvm_asm))]
 #![cfg_attr(test, allow(unused_imports))]
+#![cfg_attr(feature = "std_detect_file_io", feature(vec_spare_capacity))]
 #![no_std]
 
-cfg_if::cfg_if! {
-    if #[cfg(any(feature = "std_detect_file_io", feature = "std_detect_env_override"))] {
-        #[cfg_attr(test, macro_use(println))]
-        extern crate std;
+#[cfg(feature = "std_detect_file_io")]
+extern crate alloc;
 
-        #[allow(unused_imports)]
-        use std::{arch, env, fs, io, mem, sync};
-    } else {
-        #[cfg(test)]
-        #[macro_use(println)]
-        extern crate std;
-
-        #[allow(unused_imports)]
-        use core::{arch, mem, sync};
-    }
-}
-
-#[cfg(feature = "std_detect_dlsym_getauxval")]
-extern crate libc;
+#[cfg(test)]
+#[macro_use]
+extern crate std;
 
 #[doc(hidden)]
 #[unstable(feature = "stdsimd", issue = "27731")]
diff --git a/crates/std_detect/src/mod.rs b/crates/std_detect/src/mod.rs
deleted file mode 100644
index b630e7ff38..0000000000
--- a/crates/std_detect/src/mod.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-//! `std_detect`
-
-#[doc(hidden)] // unstable implementation detail
-#[unstable(feature = "stdsimd", issue = "27731")]
-pub mod detect;