diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs index da04b70342..878c8957f1 100644 --- a/crates/core_arch/src/x86/avx512bf16.rs +++ b/crates/core_arch/src/x86/avx512bf16.rs @@ -30,8 +30,8 @@ extern "C" { fn dpbf16ps_512(a: f32x16, b: i32x16, c: i32x16) -> f32x16; } -/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// Convert packed single-precision (32-bit) floating-point elements in two 128-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a /// 128-bit wide vector. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_cvtne2ps_pbh) #[inline] @@ -41,9 +41,9 @@ pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) } -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using writemask k (elements are copied from src when the +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the /// corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_mask_cvtne2ps_pbh) #[inline] @@ -54,279 +54,294 @@ pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __ transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) } -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding /// mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651&avx512techs=AVX512_BF16&text=_mm_maskz_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm_maskz_cvtne2ps_pbh (k: __mmask8, a: __m128, b: __m128) -> __m128bh { +pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh { let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8(); let zero = _mm_setzero_si128().as_u16x8(); transmute(simd_select_bitmask(k, cvt, zero)) } -/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a +/// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a /// 256-bit wide vector. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm256_cvtne2ps_pbh (a: __m256, b: __m256) -> __m256bh { +pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) } -/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b +/// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b /// to packed BF16 (16-bit) floating-point elements and and store the results in single vector /// dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_mask_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm256_mask_cvtne2ps_pbh (src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { +pub unsafe fn _mm256_mask_cvtne2ps_pbh( + src: __m256bh, + k: __mmask16, + a: __m256, + b: __m256, +) -> __m256bh { let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) } /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b -/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector +/// to packed BF16 (16-bit) floating-point elements, and store the results in single vector /// dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm256_maskz_cvtne2ps_pbh (k: __mmask16, a: __m256, b: __m256) -> __m256bh { +pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); let zero = _mm256_setzero_si256().as_u16x16(); transmute(simd_select_bitmask(k, cvt, zero)) } -/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors +/// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors /// a and b to packed BF16 (16-bit) floating-point elements, and store the results in a /// 512-bit wide vector. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm512_cvtne2ps_pbh (a: __m512, b: __m512) -> __m512bh { +pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) } -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using writemask k (elements are copied from src when the +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using writemask k (elements are copied from src when the /// corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_mask_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm512_mask_cvtne2ps_pbh (src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { +pub unsafe fn _mm512_mask_cvtne2ps_pbh( + src: __m512bh, + k: __mmask32, + a: __m512, + b: __m512, +) -> __m512bh { let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) } -/// Convert packed single-precision (32-bit) floating-point elements in two vectors -/// a and b to packed BF16 (16-bit) floating-point elements, and store the results -/// in single vector dst using zeromask k (elements are zeroed out when the corresponding +/// Convert packed single-precision (32-bit) floating-point elements in two vectors +/// a and b to packed BF16 (16-bit) floating-point elements, and store the results +/// in single vector dst using zeromask k (elements are zeroed out when the corresponding /// mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtne2ps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm512_maskz_cvtne2ps_pbh (k: __mmask32, a: __m512, b: __m512) -> __m512bh { +pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); let zero = _mm512_setzero_si512().as_u16x32(); transmute(simd_select_bitmask(k, cvt, zero)) } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) /// floating-point elements, and store the results in dst. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_cvtneps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm256_cvtneps_pbh (a: __m256) -> __m128bh { +pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { transmute(cvtneps2bf16_256(a.as_f32x8())) } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using writemask k +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k /// (elements are copied from src when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_cvtneps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm256_mask_cvtneps_pbh (src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { +pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using zeromask k +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k /// (elements are zeroed out when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_cvtneps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm256_maskz_cvtneps_pbh (k: __mmask8, a: __m256) -> __m128bh { +pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); let zero = _mm_setzero_si128().as_u16x8(); transmute(simd_select_bitmask(k, cvt, zero)) } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) /// floating-point elements, and store the results in dst. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_cvtneps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm512_cvtneps_pbh (a: __m512) -> __m256bh { +pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { transmute(cvtneps2bf16_512(a.as_f32x16())) } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using writemask k +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using writemask k /// (elements are copied from src when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_cvtneps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm512_mask_cvtneps_pbh (src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { +pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) -/// floating-point elements, and store the results in dst using zeromask k +/// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) +/// floating-point elements, and store the results in dst using zeromask k /// (elements are zeroed out when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_cvtneps_pbh) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm512_maskz_cvtneps_pbh (k: __mmask16, a: __m512) -> __m256bh { +pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); let zero = _mm256_setzero_si256().as_u16x16(); transmute(simd_select_bitmask(k, cvt, zero)) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements /// with elements in src, and store the results in dst. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm_dpbf16_ps (src: __m128, a: __m128bh, b: __m128bh) -> __m128 { +pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4())) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using writemask k +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k /// (elements are copied from src when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_mask_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm_mask_dpbf16_ps (src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { +pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); transmute(simd_select_bitmask(k, rst, src.as_f32x4())) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using zeromask k +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k /// (elements are zeroed out when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm_maskz_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm_maskz_dpbf16_ps (k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { +pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); let zero = _mm_set1_ps(0.0_f32).as_f32x4(); transmute(simd_select_bitmask(k, rst, zero)) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements /// with elements in src, and store the results in dst. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm256_dpbf16_ps (src: __m256, a: __m256bh, b: __m256bh) -> __m256 { +pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8())) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using writemask k +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k /// (elements are copied from src when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_mask_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm256_mask_dpbf16_ps (src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { +pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); transmute(simd_select_bitmask(k, rst, src.as_f32x8())) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using zeromask k +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k /// (elements are zeroed out when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm256_maskz_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm256_maskz_dpbf16_ps (k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { +pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, rst, zero)) } -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) -/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst.Compute dot-product of BF16 (16-bit) +/// floating-point pairs in a and b, accumulating the intermediate single-precision (32-bit) /// floating-point elements with elements in src, and store the results in dst. /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm512_dpbf16_ps (src: __m512, a: __m512bh, b: __m512bh) -> __m512 { +pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16())) - } +} -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using writemask k +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using writemask k /// (elements are copied from src when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_mask_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm512_mask_dpbf16_ps (src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { +pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); transmute(simd_select_bitmask(k, rst, src.as_f32x16())) - } +} -/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, -/// accumulating the intermediate single-precision (32-bit) floating-point elements -/// with elements in src, and store the results in dst using zeromask k +/// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, +/// accumulating the intermediate single-precision (32-bit) floating-point elements +/// with elements in src, and store the results in dst using zeromask k /// (elements are zeroed out when the corresponding mask bit is not set). /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=1769,1651,1654,1657,1660&avx512techs=AVX512_BF16&text=_mm512_maskz_dpbf16_ps) #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm512_maskz_dpbf16_ps (k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { +pub unsafe fn _mm512_maskz_dpbf16_ps( + k: __mmask16, + src: __m512, + a: __m512bh, + b: __m512bh, +) -> __m512 { let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, rst, zero)) - } +} #[cfg(test)] mod tests { @@ -342,18 +357,32 @@ mod tests { let c: __m128bh = _mm_cvtne2ps_pbh(a, b); let result: [u16; 8] = transmute(c.as_u16x8()); let expected_result: [u16; 8] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_mask_cvtne2ps_pbh(){ + unsafe fn test_mm_mask_cvtne2ps_pbh() { let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; let src_array: [u16; 8] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; let src: __m128bh = transmute(src_array); let a: __m128 = transmute(a_array); let b: __m128 = transmute(b_array); @@ -361,8 +390,15 @@ mod tests { let c: __m128bh = _mm_mask_cvtne2ps_pbh(src, k, a, b); let result: [u16; 8] = transmute(c.as_u16x8()); let expected_result: [u16; 8] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; assert_eq!(result, expected_result); let k = 0b0000_0000; let c = _mm_mask_cvtne2ps_pbh(src, k, a, b); @@ -372,7 +408,7 @@ mod tests { } #[simd_test(enable = "avx512bf16,avx512vl")] - unsafe fn test_mm_maskz_cvtne2ps_pbh(){ + unsafe fn test_mm_maskz_cvtne2ps_pbh() { let a_array = [178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32]; let b_array = [-178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32]; let a: __m128 = transmute(a_array); @@ -381,51 +417,119 @@ mod tests { let c: __m128bh = _mm_maskz_cvtne2ps_pbh(k, a, b); let result: [u16; 8] = transmute(c.as_u16x8()); let expected_result: [u16; 8] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; assert_eq!(result, expected_result); let k = 0b0011_1100; let c = _mm_maskz_cvtne2ps_pbh(k, a, b); let result: [u16; 8] = transmute(c.as_u16x8()); let expected_result: [u16; 8] = [ - 0, 0, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0, 0]; + 0, + 0, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0, + 0, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_cvtne2ps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let b_array = [ - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; let a: __m256 = transmute(a_array); let b: __m256 = transmute(b_array); let c: __m256bh = _mm256_cvtne2ps_pbh(a, b); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_mask_cvtne2ps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let b_array = [ - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; let src_array: [u16; 16] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; let src: __m256bh = transmute(src_array); let a: __m256 = transmute(a_array); let b: __m256 = transmute(b_array); @@ -433,10 +537,23 @@ mod tests { let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0; let c: __m256bh = _mm256_mask_cvtne2ps_pbh(src, k, a, b); @@ -448,82 +565,224 @@ mod tests { #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_maskz_cvtne2ps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let b_array = [ - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; let a: __m256 = transmute(a_array); let b: __m256 = transmute(b_array); let k: __mmask16 = 0xffff; let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0b0110_1100_0011_0110; let c: __m256bh = _mm256_maskz_cvtne2ps_pbh(k, a, b); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0, 0b1_10000010_0101000, 0b1_10000000_1110000, 0, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0, 0, - 0, 0, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0, 0b0_10000110_1111111, 0b0_10001000_1111010, 0]; + 0, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0, + 0, + 0, + 0, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_cvtne2ps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let b_array = [ - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32, - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; let a: __m512 = transmute(a_array); let b: __m512 = transmute(b_array); let c: __m512bh = _mm512_cvtne2ps_pbh(a, b); let result: [u16; 32] = transmute(c.as_u16x32()); let expected_result: [u16; 32] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_mask_cvtne2ps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let b_array = [ - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32, - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; let src_array: [u16; 32] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + ]; let src: __m512bh = transmute(src_array); let a: __m512 = transmute(a_array); let b: __m512 = transmute(b_array); @@ -531,14 +790,39 @@ mod tests { let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); let result: [u16; 32] = transmute(c.as_u16x32()); let expected_result: [u16; 32] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask32 = 0; let c: __m512bh = _mm512_mask_cvtne2ps_pbh(src, k, a, b); @@ -550,143 +834,334 @@ mod tests { #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_maskz_cvtne2ps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let b_array = [ - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32, - -178.125_f32, -10.5_f32, -3.75_f32, -50.25_f32, - -16.5_f32, -255.11_f32, -1000.158_f32, -575.575_f32]; + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + -178.125_f32, + -10.5_f32, + -3.75_f32, + -50.25_f32, + -16.5_f32, + -255.11_f32, + -1000.158_f32, + -575.575_f32, + ]; let a: __m512 = transmute(a_array); let b: __m512 = transmute(b_array); let k: __mmask32 = 0xffffffff; let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); let result: [u16; 32] = transmute(c.as_u16x32()); let expected_result: [u16; 32] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask32 = 0b1100_1010_1001_0110_1010_0011_0101_0110; let c: __m512bh = _mm512_maskz_cvtne2ps_pbh(k, a, b); let result: [u16; 32] = transmute(c.as_u16x32()); let expected_result: [u16; 32] = [ - 0, 0b1_10000010_0101000, 0b1_10000000_1110000, 0, - 0b1_10000011_0000100, 0, 0b1_10001000_1111010, 0, - 0b1_10000110_0110010, 0b1_10000010_0101000, 0, 0, - 0, 0b1_10000110_1111111, 0, 0b1_10001000_0010000, - 0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, - 0b0_10000011_0000100, 0, 0, 0b0_10001000_0010000, - 0, 0b0_10000010_0101000, 0, 0b0_10000100_1001001, - 0, 0, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0, + 0b1_10000011_0000100, + 0, + 0b1_10001000_1111010, + 0, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0, + 0, + 0, + 0b1_10000110_1111111, + 0, + 0b1_10001000_0010000, + 0, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0, + 0b0_10000011_0000100, + 0, + 0, + 0b0_10001000_0010000, + 0, + 0b0_10000010_0101000, + 0, + 0b0_10000100_1001001, + 0, + 0, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_cvtneps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let a: __m256 = transmute(a_array); let c: __m128bh = _mm256_cvtneps_pbh(a); let result: [u16; 8] = transmute(c.as_u16x8()); let expected_result: [u16; 8] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_mask_cvtneps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let src_array: [u16; 8] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + ]; let src: __m128bh = transmute(src_array); let a: __m256 = transmute(a_array); let k: __mmask8 = 0xff; let b = _mm256_mask_cvtneps_pbh(src, k, a); let result: [u16; 8] = transmute(b.as_u16x8()); let expected_result: [u16; 8] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask8 = 0x0; - let b: __m128bh = _mm256_mask_cvtneps_pbh (src, k, a); + let b: __m128bh = _mm256_mask_cvtneps_pbh(src, k, a); let result: [u16; 8] = transmute(b.as_u16x8()); let expected_result: [u16; 8] = src_array; assert_eq!(result, expected_result); } - + #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_maskz_cvtneps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let a: __m256 = transmute(a_array); let k: __mmask8 = 0xff; let b = _mm256_maskz_cvtneps_pbh(k, a); let result: [u16; 8] = transmute(b.as_u16x8()); let expected_result: [u16; 8] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask8 = 0x6; - let b: __m128bh = _mm256_maskz_cvtneps_pbh (k, a); + let b: __m128bh = _mm256_maskz_cvtneps_pbh(k, a); let result: [u16; 8] = transmute(b.as_u16x8()); - let expected_result: [u16; 8] = [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; + let expected_result: [u16; 8] = + [0, 0b0_10000010_0101000, 0b0_10000000_1110000, 0, 0, 0, 0, 0]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_cvtneps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let a: __m512 = transmute(a_array); let c: __m256bh = _mm512_cvtneps_pbh(a); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_mask_cvtneps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let src_array: [u16; 16] = [ - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000, - 0b1_10000110_0110010, 0b1_10000010_0101000, 0b1_10000000_1110000, 0b1_10000100_1001001, - 0b1_10000011_0000100, 0b1_10000110_1111111, 0b1_10001000_1111010, 0b1_10001000_0010000]; + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + 0b1_10000110_0110010, + 0b1_10000010_0101000, + 0b1_10000000_1110000, + 0b1_10000100_1001001, + 0b1_10000011_0000100, + 0b1_10000110_1111111, + 0b1_10001000_1111010, + 0b1_10001000_0010000, + ]; let src: __m256bh = transmute(src_array); let a: __m512 = transmute(a_array); let k: __mmask16 = 0xffff; let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0; let c: __m256bh = _mm512_mask_cvtneps_pbh(src, k, a); @@ -698,28 +1173,67 @@ mod tests { #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_maskz_cvtneps_pbh() { let a_array = [ - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32, - 178.125_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 16.5_f32, 255.11_f32, 1000.158_f32, 575.575_f32]; + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + 178.125_f32, + 10.5_f32, + 3.75_f32, + 50.25_f32, + 16.5_f32, + 255.11_f32, + 1000.158_f32, + 575.575_f32, + ]; let a: __m512 = transmute(a_array); let k: __mmask16 = 0xffff; let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000, - 0b0_10000110_0110010, 0b0_10000010_0101000, 0b0_10000000_1110000, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0b0_10001000_1111010, 0b0_10001000_0010000]; + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + 0b0_10000110_0110010, + 0b0_10000010_0101000, + 0b0_10000000_1110000, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0b0_10001000_0010000, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0x653a; let c: __m256bh = _mm512_maskz_cvtneps_pbh(k, a); let result: [u16; 16] = transmute(c.as_u16x16()); let expected_result: [u16; 16] = [ - 0, 0b0_10000010_0101000, 0, 0b0_10000100_1001001, - 0b0_10000011_0000100, 0b0_10000110_1111111, 0, 0, - 0b0_10000110_0110010, 0, 0b0_10000000_1110000, 0, - 0, 0b0_10000110_1111111, 0b0_10001000_1111010, 0]; + 0, + 0b0_10000010_0101000, + 0, + 0b0_10000100_1001001, + 0b0_10000011_0000100, + 0b0_10000110_1111111, + 0, + 0, + 0b0_10000110_0110010, + 0, + 0b0_10000000_1110000, + 0, + 0, + 0b0_10000110_1111111, + 0b0_10001000_1111010, + 0, + ]; assert_eq!(result, expected_result); } @@ -732,7 +1246,7 @@ mod tests { let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); - let c: __m128 = _mm_dpbf16_ps (src, a, b); + let c: __m128 = _mm_dpbf16_ps(src, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; assert_eq!(result, expected_result); @@ -748,17 +1262,17 @@ mod tests { let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); - let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b); + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; assert_eq!(result, expected_result); let k: __mmask8 = 0xff; - let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b); + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; assert_eq!(result, expected_result); let k: __mmask8 = 0; - let c: __m128 = _mm_mask_dpbf16_ps (src, k, a, b); + let c: __m128 = _mm_mask_dpbf16_ps(src, k, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; assert_eq!(result, expected_result); @@ -774,17 +1288,17 @@ mod tests { let src: __m128 = transmute([1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); let a: __m128bh = _mm_cvtne2ps_pbh(a1, a1); let b: __m128bh = _mm_cvtne2ps_pbh(b1, b1); - let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b); + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, 0.0, 0.0]; assert_eq!(result, expected_result); let k: __mmask8 = 0xff; - let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b); + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [-18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; assert_eq!(result, expected_result); let k: __mmask8 = 0; - let c: __m128 = _mm_maskz_dpbf16_ps (k, src, a, b); + let c: __m128 = _mm_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 4] = transmute(c.as_f32x4()); let expected_result: [f32; 4] = [0.0, 0.0, 0.0, 0.0]; assert_eq!(result, expected_result); @@ -793,95 +1307,95 @@ mod tests { #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_dpbf16_ps() { let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; let a1: __m256 = transmute(a_array); let b1: __m256 = transmute(b_array); let src: __m256 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); - let c: __m256 = _mm256_dpbf16_ps (src, a, b); + let c: __m256 = _mm256_dpbf16_ps(src, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_mask_dpbf16_ps() { let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; let a1: __m256 = transmute(a_array); let b1: __m256 = transmute(b_array); let k: __mmask8 = 0x33; let src: __m256 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); - let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b); + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + ]; assert_eq!(result, expected_result); let k: __mmask8 = 0xff; - let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b); + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; assert_eq!(result, expected_result); let k: __mmask8 = 0; - let c: __m256 = _mm256_mask_dpbf16_ps (src, k, a, b); + let c: __m256 = _mm256_mask_dpbf16_ps(src, k, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512vl")] unsafe fn test_mm256_maskz_dpbf16_ps() { let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; let a1: __m256 = transmute(a_array); let b1: __m256 = transmute(b_array); let k: __mmask8 = 0x33; let src: __m256 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); let a: __m256bh = _mm256_cvtne2ps_pbh(a1, a1); let b: __m256bh = _mm256_cvtne2ps_pbh(b1, b1); - let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b); + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, 0.0, 0.0, - -18.0_f32, -52.0_f32, 0.0, 0.0]; + -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, + ]; assert_eq!(result, expected_result); let k: __mmask8 = 0xff; - let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b); + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; assert_eq!(result, expected_result); let k: __mmask8 = 0; - let c: __m256 = _mm256_maskz_dpbf16_ps (k, src, a, b); + let c: __m256 = _mm256_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 8] = transmute(c.as_f32x8()); let expected_result: [f32; 8] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; assert_eq!(result, expected_result); @@ -890,129 +1404,114 @@ mod tests { #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_dpbf16_ps() { let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; let a1: __m512 = transmute(a_array); let b1: __m512 = transmute(b_array); - let src :__m512 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + let src: __m512 = transmute([ + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); - let c: __m512 = _mm512_dpbf16_ps (src, a, b); + let c: __m512 = _mm512_dpbf16_ps(src, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_mask_dpbf16_ps() { let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; let a1: __m512 = transmute(a_array); let b1: __m512 = transmute(b_array); let k: __mmask16 = 0x3333; let src: __m512 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); - let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b); + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, - -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32]; + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, -18.0_f32, -52.0_f32, 3.0_f32, 4.0_f32, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0xffff; - let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b); + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0; - let c: __m512 = _mm512_mask_dpbf16_ps (src, k, a, b); + let c: __m512 = _mm512_mask_dpbf16_ps(src, k, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]; + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]; assert_eq!(result, expected_result); } #[simd_test(enable = "avx512bf16,avx512f")] unsafe fn test_mm512_maskz_dpbf16_ps() { let a_array = [ - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, - 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32]; + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, 8.5_f32, 10.5_f32, 3.75_f32, 50.25_f32, + ]; let b_array = [ - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, - -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32]; + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, -1.0_f32, + ]; let a1: __m512 = transmute(a_array); let b1: __m512 = transmute(b_array); let k: __mmask16 = 0x3333; let src: __m512 = transmute([ - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, - 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32]); + 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, + 2.0_f32, 3.0_f32, 4.0_f32, 1.0_f32, 2.0_f32, 3.0_f32, 4.0_f32, + ]); let a: __m512bh = _mm512_cvtne2ps_pbh(a1, a1); let b: __m512bh = _mm512_cvtne2ps_pbh(b1, b1); let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, 0.0, 0.0, - -18.0_f32, -52.0_f32, 0.0, 0.0, - -18.0_f32, -52.0_f32, 0.0, 0.0, - -18.0_f32, -52.0_f32, 0.0, 0.0]; + -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, -18.0_f32, -52.0_f32, + 0.0, 0.0, -18.0_f32, -52.0_f32, 0.0, 0.0, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0xffff; - let c: __m512 = _mm512_maskz_dpbf16_ps (k, src, a, b); + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, - -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32]; + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, -18.0_f32, -52.0_f32, -16.0_f32, -50.0_f32, + ]; assert_eq!(result, expected_result); let k: __mmask16 = 0; - let c: __m512 = _mm512_maskz_dpbf16_ps (k, src, a, b); + let c: __m512 = _mm512_maskz_dpbf16_ps(k, src, a, b); let result: [f32; 16] = transmute(c.as_f32x16()); let expected_result: [f32; 16] = [ - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + ]; assert_eq!(result, expected_result); } -} \ No newline at end of file +} diff --git a/crates/std_detect/src/detect/cache.rs b/crates/std_detect/src/detect/cache.rs index e79c96dafa..d01a5ea244 100644 --- a/crates/std_detect/src/detect/cache.rs +++ b/crates/std_detect/src/detect/cache.rs @@ -3,9 +3,9 @@ #![allow(dead_code)] // not used on all platforms -use crate::sync::atomic::Ordering; +use core::sync::atomic::Ordering; -use crate::sync::atomic::AtomicUsize; +use core::sync::atomic::AtomicUsize; /// Sets the `bit` of `x`. #[inline] @@ -125,9 +125,16 @@ cfg_if::cfg_if! { if #[cfg(feature = "std_detect_env_override")] { #[inline] fn initialize(mut value: Initializer) -> Initializer { - if let Ok(disable) = crate::env::var("RUST_STD_DETECT_UNSTABLE") { - for v in disable.split(" ") { - let _ = super::Feature::from_str(v).map(|v| value.unset(v as u32)); + let env = unsafe { + libc::getenv(b"RUST_STD_DETECT_UNSTABLE\0".as_ptr() as *const libc::c_char) + }; + if !env.is_null() { + let len = unsafe { libc::strlen(env) }; + let env = unsafe { core::slice::from_raw_parts(env as *const u8, len) }; + if let Ok(disable) = core::str::from_utf8(env) { + for v in disable.split(" ") { + let _ = super::Feature::from_str(v).map(|v| value.unset(v as u32)); + } } } do_initialize(value); diff --git a/crates/std_detect/src/detect/mod.rs b/crates/std_detect/src/detect/mod.rs index 7aedef47d6..1b7768ae8f 100644 --- a/crates/std_detect/src/detect/mod.rs +++ b/crates/std_detect/src/detect/mod.rs @@ -98,10 +98,10 @@ cfg_if! { // On x86/x86_64 no OS specific functionality is required. #[path = "os/x86.rs"] mod os; - } else if #[cfg(all(target_os = "linux", feature = "use_std"))] { + } else if #[cfg(all(target_os = "linux", feature = "libc"))] { #[path = "os/linux/mod.rs"] mod os; - } else if #[cfg(all(target_os = "freebsd", feature = "use_std"))] { + } else if #[cfg(all(target_os = "freebsd", feature = "libc"))] { #[cfg(target_arch = "aarch64")] #[path = "os/aarch64.rs"] mod aarch64; @@ -140,7 +140,7 @@ pub fn features() -> impl Iterator { target_arch = "mips64", ))] { (0_u8..Feature::_last as u8).map(|discriminant: u8| { - let f: Feature = unsafe { crate::mem::transmute(discriminant) }; + let f: Feature = unsafe { core::mem::transmute(discriminant) }; let name: &'static str = f.to_str(); let enabled: bool = check_for(f); (name, enabled) diff --git a/crates/std_detect/src/detect/os/freebsd/auxvec.rs b/crates/std_detect/src/detect/os/freebsd/auxvec.rs index c595ec459b..832ce2252e 100644 --- a/crates/std_detect/src/detect/os/freebsd/auxvec.rs +++ b/crates/std_detect/src/detect/os/freebsd/auxvec.rs @@ -42,7 +42,7 @@ pub(crate) fn auxv() -> Result { /// Tries to read the `key` from the auxiliary vector. fn archauxv(key: usize) -> Result { - use crate::mem; + use core::mem; #[derive(Copy, Clone)] #[repr(C)] diff --git a/crates/std_detect/src/detect/os/freebsd/mod.rs b/crates/std_detect/src/detect/os/freebsd/mod.rs index 4321bce74d..ade7fb6269 100644 --- a/crates/std_detect/src/detect/os/freebsd/mod.rs +++ b/crates/std_detect/src/detect/os/freebsd/mod.rs @@ -5,18 +5,18 @@ mod auxvec; cfg_if::cfg_if! { if #[cfg(target_arch = "aarch64")] { mod aarch64; - pub use self::aarch64::check_for; + pub(crate) use self::aarch64::detect_features; } else if #[cfg(target_arch = "arm")] { mod arm; - pub use self::arm::check_for; + pub(crate) use self::arm::detect_features; } else if #[cfg(target_arch = "powerpc64")] { mod powerpc; - pub use self::powerpc::check_for; + pub(crate) use self::powerpc::detect_features; } else { - use crate::arch::detect::Feature; + use crate::detect::cache; /// Performs run-time feature detection. - pub fn check_for(_x: Feature) -> bool { - false + pub(crate) fn detect_features() -> cache::Initializer { + cache::Initializer::default() } } } diff --git a/crates/std_detect/src/detect/os/linux/aarch64.rs b/crates/std_detect/src/detect/os/linux/aarch64.rs index b1b68f763e..80c36e9b99 100644 --- a/crates/std_detect/src/detect/os/linux/aarch64.rs +++ b/crates/std_detect/src/detect/os/linux/aarch64.rs @@ -1,6 +1,6 @@ //! Run-time feature detection for Aarch64 on Linux. -use super::{auxvec, cpuinfo}; +use super::auxvec; use crate::detect::{bit, cache, Feature}; /// Try to read the features from the auxiliary vector, and if that fails, try @@ -10,7 +10,8 @@ pub(crate) fn detect_features() -> cache::Initializer { let hwcap: AtHwcap = auxv.into(); return hwcap.cache(); } - if let Ok(c) = cpuinfo::CpuInfo::new() { + #[cfg(feature = "std_detect_file_io")] + if let Ok(c) = super::cpuinfo::CpuInfo::new() { let hwcap: AtHwcap = c.into(); return hwcap.cache(); } @@ -77,9 +78,10 @@ impl From for AtHwcap { } } -impl From for AtHwcap { +#[cfg(feature = "std_detect_file_io")] +impl From for AtHwcap { /// Reads AtHwcap from /proc/cpuinfo . - fn from(c: cpuinfo::CpuInfo) -> Self { + fn from(c: super::cpuinfo::CpuInfo) -> Self { let f = &c.field("Features"); AtHwcap { // 64-bit names. FIXME: In 32-bit compatibility mode /proc/cpuinfo will diff --git a/crates/std_detect/src/detect/os/linux/arm.rs b/crates/std_detect/src/detect/os/linux/arm.rs index 4b0cb586bb..66cfd05e80 100644 --- a/crates/std_detect/src/detect/os/linux/arm.rs +++ b/crates/std_detect/src/detect/os/linux/arm.rs @@ -1,6 +1,6 @@ //! Run-time feature detection for ARM on Linux. -use super::{auxvec, cpuinfo}; +use super::auxvec; use crate::detect::{bit, cache, Feature}; /// Try to read the features from the auxiliary vector, and if that fails, try @@ -31,7 +31,8 @@ pub(crate) fn detect_features() -> cache::Initializer { return value; } - if let Ok(c) = cpuinfo::CpuInfo::new() { + #[cfg(feature = "std_detect_file_io")] + if let Ok(c) = super::cpuinfo::CpuInfo::new() { enable_feature( &mut value, Feature::neon, @@ -55,7 +56,8 @@ pub(crate) fn detect_features() -> cache::Initializer { /// Is the CPU known to have a broken NEON unit? /// /// See https://crbug.com/341598. -fn has_broken_neon(cpuinfo: &cpuinfo::CpuInfo) -> bool { +#[cfg(feature = "std_detect_file_io")] +fn has_broken_neon(cpuinfo: &super::cpuinfo::CpuInfo) -> bool { cpuinfo.field("CPU implementer") == "0x51" && cpuinfo.field("CPU architecture") == "7" && cpuinfo.field("CPU variant") == "0x1" diff --git a/crates/std_detect/src/detect/os/linux/auxvec.rs b/crates/std_detect/src/detect/os/linux/auxvec.rs index 6ebae67fbf..d556b23b1d 100644 --- a/crates/std_detect/src/detect/os/linux/auxvec.rs +++ b/crates/std_detect/src/detect/os/linux/auxvec.rs @@ -1,13 +1,16 @@ //! Parses ELF auxiliary vectors. #![cfg_attr(not(target_arch = "aarch64"), allow(dead_code))] -#[cfg(feature = "std_detect_file_io")] -use crate::{fs::File, io::Read}; +pub(crate) const AT_NULL: usize = 0; /// Key to access the CPU Hardware capabilities bitfield. pub(crate) const AT_HWCAP: usize = 16; /// Key to access the CPU Hardware capabilities 2 bitfield. -#[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] +#[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" +))] pub(crate) const AT_HWCAP2: usize = 26; /// Cache HWCAP bitfields of the ELF Auxiliary Vector. @@ -17,7 +20,11 @@ pub(crate) const AT_HWCAP2: usize = 26; #[derive(Debug, Copy, Clone)] pub(crate) struct AuxVec { pub hwcap: usize, - #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] pub hwcap2: usize, } @@ -64,7 +71,11 @@ pub(crate) fn auxv() -> Result { } // Targets with AT_HWCAP and AT_HWCAP2: - #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] { if let Ok(hwcap2) = getauxval(AT_HWCAP2) { if hwcap != 0 && hwcap2 != 0 { @@ -74,21 +85,11 @@ pub(crate) fn auxv() -> Result { } drop(hwcap); } - #[cfg(feature = "std_detect_file_io")] - { - // If calling getauxval fails, try to read the auxiliary vector from - // its file: - auxv_from_file("/proc/self/auxv") - } - #[cfg(not(feature = "std_detect_file_io"))] - { - Err(()) - } } #[cfg(not(feature = "std_detect_dlsym_getauxval"))] { - let hwcap = unsafe { ffi_getauxval(AT_HWCAP) }; + let hwcap = unsafe { libc::getauxval(AT_HWCAP) }; // Targets with only AT_HWCAP: #[cfg(any(target_arch = "aarch64", target_arch = "mips", target_arch = "mips64"))] @@ -99,14 +100,29 @@ pub(crate) fn auxv() -> Result { } // Targets with AT_HWCAP and AT_HWCAP2: - #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] { - let hwcap2 = unsafe { ffi_getauxval(AT_HWCAP2) }; + let hwcap2 = unsafe { libc::getauxval(AT_HWCAP2) }; if hwcap != 0 && hwcap2 != 0 { return Ok(AuxVec { hwcap, hwcap2 }); } } } + + #[cfg(feature = "std_detect_file_io")] + { + // If calling getauxval fails, try to read the auxiliary vector from + // its file: + auxv_from_file("/proc/self/auxv") + } + #[cfg(not(feature = "std_detect_file_io"))] + { + Err(()) + } } /// Tries to read the `key` from the auxiliary vector by calling the @@ -122,7 +138,7 @@ fn getauxval(key: usize) -> Result { return Err(()); } - let ffi_getauxval: F = mem::transmute(ptr); + let ffi_getauxval: F = core::mem::transmute(ptr); Ok(ffi_getauxval(key)) } } @@ -131,7 +147,7 @@ fn getauxval(key: usize) -> Result { /// function returns `Err`. #[cfg(feature = "std_detect_file_io")] fn auxv_from_file(file: &str) -> Result { - let mut file = File::open(file).map_err(|_| ())?; + let file = super::read_file(file)?; // See . // @@ -139,10 +155,11 @@ fn auxv_from_file(file: &str) -> Result { // `AT_EXECFN = 31` to `AT_NULL = 0`. That is, a buffer of // 2*32 `usize` elements is enough to read the whole vector. let mut buf = [0_usize; 64]; - { - let raw: &mut [u8; 64 * mem::size_of::()] = unsafe { mem::transmute(&mut buf) }; - file.read(raw).map_err(|_| ())?; + let len = core::mem::size_of_val(&buf).max(file.len()); + unsafe { + core::ptr::copy_nonoverlapping(file.as_ptr(), buf.as_mut_ptr() as *mut u8, len); } + auxv_from_buf(&buf) } @@ -155,18 +172,24 @@ fn auxv_from_buf(buf: &[usize; 64]) -> Result { { for el in buf.chunks(2) { match el[0] { + AT_NULL => break, AT_HWCAP => return Ok(AuxVec { hwcap: el[1] }), _ => (), } } } // Targets with AT_HWCAP and AT_HWCAP2: - #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] { let mut hwcap = None; let mut hwcap2 = None; for el in buf.chunks(2) { match el[0] { + AT_NULL => break, AT_HWCAP => hwcap = Some(el[1]), AT_HWCAP2 => hwcap2 = Some(el[1]), _ => (), @@ -214,7 +237,12 @@ mod tests { // FIXME: on mips/mips64 getauxval returns 0, and /proc/self/auxv // does not always contain the AT_HWCAP key under qemu. - #[cfg(not(any(target_arch = "mips", target_arch = "mips64", target_arch = "powerpc")))] + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] #[test] fn auxv_crate() { let v = auxv(); @@ -224,7 +252,11 @@ mod tests { } // Targets with AT_HWCAP and AT_HWCAP2: - #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] { if let Some(hwcap2) = auxv_crate_getauxval(AT_HWCAP2) { let rt_hwcap2 = v.expect("failed to find hwcap2 key").hwcap2; @@ -243,7 +275,7 @@ mod tests { } #[cfg(feature = "std_detect_file_io")] - cfg_if! { + cfg_if::cfg_if! { if #[cfg(target_arch = "arm")] { #[test] fn linux_rpi3() { @@ -264,6 +296,7 @@ mod tests { // want to fall back to /proc/cpuinfo in this case, so // reading should fail. assert_eq!(v.hwcap, 126614527); // assert_eq!(v.hwcap2, 0); + let _ = v; } } else if #[cfg(target_arch = "aarch64")] { #[test] @@ -286,7 +319,14 @@ mod tests { } } + #[cfg(any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] #[test] + #[cfg(feature = "std_detect_file_io")] fn auxv_crate_procfs() { let v = auxv(); if let Some(hwcap) = auxv_crate_getprocfs(AT_HWCAP) { @@ -294,7 +334,11 @@ mod tests { } // Targets with AT_HWCAP and AT_HWCAP2: - #[cfg(any(target_arch = "arm", target_arch = "powerpc64"))] + #[cfg(any( + target_arch = "arm", + target_arch = "powerpc", + target_arch = "powerpc64" + ))] { if let Some(hwcap2) = auxv_crate_getprocfs(AT_HWCAP2) { assert_eq!(v.unwrap().hwcap2, hwcap2); diff --git a/crates/std_detect/src/detect/os/linux/cpuinfo.rs b/crates/std_detect/src/detect/os/linux/cpuinfo.rs index f76c48a4b1..1f403df01f 100644 --- a/crates/std_detect/src/detect/os/linux/cpuinfo.rs +++ b/crates/std_detect/src/detect/os/linux/cpuinfo.rs @@ -1,8 +1,7 @@ //! Parses /proc/cpuinfo #![cfg_attr(not(target_arch = "arm"), allow(dead_code))] -extern crate std; -use self::std::{fs::File, io, io::Read, prelude::v1::*}; +use alloc::string::String; /// cpuinfo pub(crate) struct CpuInfo { @@ -11,11 +10,11 @@ pub(crate) struct CpuInfo { impl CpuInfo { /// Reads /proc/cpuinfo into CpuInfo. - pub(crate) fn new() -> Result { - let mut file = File::open("/proc/cpuinfo")?; - let mut cpui = Self { raw: String::new() }; - file.read_to_string(&mut cpui.raw)?; - Ok(cpui) + pub(crate) fn new() -> Result { + let raw = super::read_file("/proc/cpuinfo")?; + Ok(Self { + raw: String::from_utf8(raw).map_err(|_| ())?, + }) } /// Returns the value of the cpuinfo `field`. pub(crate) fn field(&self, field: &str) -> CpuInfoField { @@ -34,7 +33,7 @@ impl CpuInfo { } #[cfg(test)] - fn from_str(other: &str) -> Result { + fn from_str(other: &str) -> Result { Ok(Self { raw: String::from(other), }) diff --git a/crates/std_detect/src/detect/os/linux/mod.rs b/crates/std_detect/src/detect/os/linux/mod.rs index e02d5e6dcd..4b6776e982 100644 --- a/crates/std_detect/src/detect/os/linux/mod.rs +++ b/crates/std_detect/src/detect/os/linux/mod.rs @@ -1,28 +1,61 @@ //! Run-time feature detection on Linux +//! +#[cfg(feature = "std_detect_file_io")] +use alloc::vec::Vec; mod auxvec; #[cfg(feature = "std_detect_file_io")] mod cpuinfo; -cfg_if! { +#[cfg(feature = "std_detect_file_io")] +fn read_file(path: &str) -> Result, ()> { + let mut path = Vec::from(path.as_bytes()); + path.push(0); + + unsafe { + let file = libc::open(path.as_ptr() as *const libc::c_char, libc::O_RDONLY); + if file == -1 { + return Err(()); + } + + let mut data = Vec::new(); + loop { + data.reserve(4096); + let spare = data.spare_capacity_mut(); + match libc::read(file, spare.as_mut_ptr() as *mut _, spare.len()) { + -1 => { + libc::close(file); + return Err(()); + } + 0 => break, + n => data.set_len(data.len() + n as usize), + } + } + + libc::close(file); + Ok(data) + } +} + +cfg_if::cfg_if! { if #[cfg(target_arch = "aarch64")] { mod aarch64; - pub use self::aarch64::check_for; + pub(crate) use self::aarch64::detect_features; } else if #[cfg(target_arch = "arm")] { mod arm; - pub use self::arm::check_for; + pub(crate) use self::arm::detect_features; } else if #[cfg(any(target_arch = "mips", target_arch = "mips64"))] { mod mips; - pub use self::mips::check_for; + pub(crate) use self::mips::detect_features; } else if #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] { mod powerpc; - pub use self::powerpc::check_for; + pub(crate) use self::powerpc::detect_features; } else { - use crate::detect::Feature; + use crate::detect::cache; /// Performs run-time feature detection. - pub fn check_for(_x: Feature) -> bool { - false + pub(crate) fn detect_features() -> cache::Initializer { + cache::Initializer::default() } } } diff --git a/crates/std_detect/src/detect/os/linux/powerpc.rs b/crates/std_detect/src/detect/os/linux/powerpc.rs index 97afe49fe5..c3308e8158 100644 --- a/crates/std_detect/src/detect/os/linux/powerpc.rs +++ b/crates/std_detect/src/detect/os/linux/powerpc.rs @@ -1,6 +1,6 @@ //! Run-time feature detection for PowerPC on Linux. -use super::{auxvec, cpuinfo}; +use super::auxvec; use crate::detect::{cache, Feature}; /// Try to read the features from the auxiliary vector, and if that fails, try @@ -27,7 +27,8 @@ pub(crate) fn detect_features() -> cache::Initializer { // PowerPC's /proc/cpuinfo lacks a proper Feature field, // but `altivec` support is indicated in the `cpu` field. - if let Ok(c) = cpuinfo::CpuInfo::new() { + #[cfg(feature = "std_detect_file_io")] + if let Ok(c) = super::cpuinfo::CpuInfo::new() { enable_feature(&mut value, Feature::altivec, c.field("cpu").has("altivec")); return value; } diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs index 436fb00f06..388af2e304 100644 --- a/crates/std_detect/src/detect/os/x86.rs +++ b/crates/std_detect/src/detect/os/x86.rs @@ -1,11 +1,11 @@ //! x86 run-time feature detection is OS independent. #[cfg(target_arch = "x86")] -use crate::arch::x86::*; +use core::arch::x86::*; #[cfg(target_arch = "x86_64")] -use crate::arch::x86_64::*; +use core::arch::x86_64::*; -use crate::mem; +use core::mem; use crate::detect::{bit, cache, Feature}; diff --git a/crates/std_detect/src/lib.rs b/crates/std_detect/src/lib.rs index 8cd02c9616..46cf8fb68d 100644 --- a/crates/std_detect/src/lib.rs +++ b/crates/std_detect/src/lib.rs @@ -15,30 +15,17 @@ #![feature(const_fn, staged_api, stdsimd, doc_cfg, allow_internal_unstable)] #![allow(clippy::shadow_reuse)] #![deny(clippy::missing_inline_in_public_items)] -#![cfg_attr(target_os = "linux", feature(linkage))] #![cfg_attr(all(target_os = "freebsd", target_arch = "aarch64"), feature(llvm_asm))] #![cfg_attr(test, allow(unused_imports))] +#![cfg_attr(feature = "std_detect_file_io", feature(vec_spare_capacity))] #![no_std] -cfg_if::cfg_if! { - if #[cfg(any(feature = "std_detect_file_io", feature = "std_detect_env_override"))] { - #[cfg_attr(test, macro_use(println))] - extern crate std; +#[cfg(feature = "std_detect_file_io")] +extern crate alloc; - #[allow(unused_imports)] - use std::{arch, env, fs, io, mem, sync}; - } else { - #[cfg(test)] - #[macro_use(println)] - extern crate std; - - #[allow(unused_imports)] - use core::{arch, mem, sync}; - } -} - -#[cfg(feature = "std_detect_dlsym_getauxval")] -extern crate libc; +#[cfg(test)] +#[macro_use] +extern crate std; #[doc(hidden)] #[unstable(feature = "stdsimd", issue = "27731")] diff --git a/crates/std_detect/src/mod.rs b/crates/std_detect/src/mod.rs deleted file mode 100644 index b630e7ff38..0000000000 --- a/crates/std_detect/src/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -//! `std_detect` - -#[doc(hidden)] // unstable implementation detail -#[unstable(feature = "stdsimd", issue = "27731")] -pub mod detect;