From 6dc5a957ac34b6682ac36af2a68cb0a155589483 Mon Sep 17 00:00:00 2001 From: sayantn Date: Tue, 29 Apr 2025 13:49:36 +0530 Subject: [PATCH 1/2] Fix `stdarch-verify` --- crates/core_arch/src/arm_shared/neon/generated.rs | 2 +- crates/core_arch/src/x86/sse.rs | 2 +- .../stdarch-gen-arm/spec/neon/arm_shared.spec.yml | 2 +- crates/stdarch-verify/src/lib.rs | 9 +-------- crates/stdarch-verify/tests/x86-intel.rs | 15 +++++++++------ 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index d797c3c589..1842ad646e 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -10880,7 +10880,7 @@ pub fn vdupq_n_p16(value: p16) -> poly16x8_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub fn vdupq_n_p8(value: u8) -> poly8x16_t { +pub fn vdupq_n_p8(value: p8) -> poly8x16_t { poly8x16_t::splat(value) } #[doc = "Duplicate vector element to vector or scalar"] diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index a845e822fc..1eca66adc2 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -3050,7 +3050,7 @@ mod tests { } #[simd_test(enable = "sse")] - unsafe fn test_mm_shuffle() { + unsafe fn test_MM_SHUFFLE() { assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index a3b756d557..d58c2ed10e 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -14937,7 +14937,7 @@ intrinsics: - ['vdupq_n_u32', 'u32', 'uint32x4_t', 'vdup.32', 'dup', 'uint32x4_t::splat(value)'] - ['vdupq_n_f32', 'f32', 'float32x4_t', 'vdup.32', 'dup', 'float32x4_t::splat(value)'] - ['vdupq_n_u64', 'u64', 'uint64x2_t', 'vmov', 'dup', 'uint64x2_t::splat(value)'] - - ['vdupq_n_p8', 'u8', 'poly8x16_t', 'vdup.8', 'dup', 'poly8x16_t::splat(value)'] + - ['vdupq_n_p8', 'p8', 'poly8x16_t', 'vdup.8', 'dup', 'poly8x16_t::splat(value)'] - ['vdupq_n_p16', 'p16', 'poly16x8_t', 'vdup.16', 'dup', 'poly16x8_t::splat(value)'] - ['vdup_n_s8', 'i8', 'int8x8_t', 'vdup.8', 'dup', 'int8x8_t::splat(value)'] - ['vdup_n_s16', 'i16', 'int16x4_t', 'vdup.16', 'dup', 'int16x4_t::splat(value)'] diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index a07c150575..38ac9864b5 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -70,14 +70,7 @@ fn functions(input: TokenStream, dirs: &[&str]) -> TokenStream { } assert!(!tests.is_empty()); - functions.retain(|(f, _)| { - if let syn::Visibility::Public(_) = f.vis { - if f.sig.unsafety.is_some() { - return true; - } - } - false - }); + functions.retain(|(f, _)| matches!(f.vis, syn::Visibility::Public(_))); assert!(!functions.is_empty()); let input = proc_macro2::TokenStream::from(input); diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 6719c06608..02b6bdc768 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -244,6 +244,7 @@ fn verify_all_signatures() { "_fxrstor", "_fxrstor64", "_xend", + "_xabort_code", // Aliases "_mm_comige_ss", "_mm_cvt_ss2si", @@ -290,13 +291,10 @@ fn 
verify_all_signatures() { "__cpuid_count" | "__cpuid" | "__get_cpuid_max" | + "_MM_SHUFFLE" | + "_xabort_code" | // Not listed with intel, but manually verified - "cmpxchg16b" | - // Intel requires the mask argument for _mm_shuffle_ps to be an - // unsigned integer, but all other _mm_shuffle_.. intrinsics - // take a signed-integer. This breaks `_MM_SHUFFLE` for - // `_mm_shuffle_ps` - "_mm_shuffle_ps" + "cmpxchg16b" => continue, _ => {} } @@ -871,6 +869,11 @@ fn equate( // The _mm_stream_load_si128 intrinsic take a mutable pointer in the intrinsics // guide even though they never write through the pointer (&Type::ConstPtr(&Type::M128I), "void*") if intrinsic.name == "_mm_stream_load_si128" => {} + /// Intel requires the mask argument for _mm_shuffle_ps to be an + // unsigned integer, but all other _mm_shuffle_.. intrinsics + // take a signed-integer. This breaks `_MM_SHUFFLE` for + // `_mm_shuffle_ps` + (&Type::PrimSigned(32), "unsigned int") if intrinsic.name == "_mm_shuffle_ps" => {} _ => bail!( "failed to equate: `{intel}` and {t:?} for {}", From 21707d914e729a1bb1c7509018b44de7dfb5d93c Mon Sep 17 00:00:00 2001 From: sayantn Date: Tue, 29 Apr 2025 13:50:26 +0530 Subject: [PATCH 2/2] Implement `sha512`, `sm3` and `sm4` intrinsics --- crates/core_arch/missing-x86.md | 25 -- crates/core_arch/src/x86/sha.rs | 528 +++++++++++++++++++++++++++++++- 2 files changed, 521 insertions(+), 32 deletions(-) diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index a190cbe922..640ec7d0fe 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -213,31 +213,6 @@

-<details><summary>["SHA512", "AVX"]</summary><p>
-
-  * [ ] [`_mm256_sha512msg1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sha512msg1_epi64)
-  * [ ] [`_mm256_sha512msg2_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sha512msg2_epi64)
-  * [ ] [`_mm256_sha512rnds2_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sha512rnds2_epi64)
-</p></details>
-
-
-<details><summary>["SM3", "AVX"]</summary><p>
-
-  * [ ] [`_mm_sm3msg1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm3msg1_epi32)
-  * [ ] [`_mm_sm3msg2_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm3msg2_epi32)
-  * [ ] [`_mm_sm3rnds2_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm3rnds2_epi32)
-</p></details>
-
-
-<details><summary>["SM4", "AVX"]</summary><p>
-
-  * [ ] [`_mm256_sm4key4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sm4key4_epi32)
-  * [ ] [`_mm256_sm4rnds4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sm4rnds4_epi32)
-  * [ ] [`_mm_sm4key4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm4key4_epi32)
-  * [ ] [`_mm_sm4rnds4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm4rnds4_epi32)
-</p></details>
-
-
 <details><summary>["SSE"]</summary><p>
 
* [ ] [`_mm_free`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 2bf517869b..8ff64a3d2d 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -16,6 +16,26 @@ unsafe extern "C" { fn sha256msg2(a: i32x4, b: i32x4) -> i32x4; #[link_name = "llvm.x86.sha256rnds2"] fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4; + #[link_name = "llvm.x86.vsha512msg1"] + fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4; + #[link_name = "llvm.x86.vsha512msg2"] + fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4; + #[link_name = "llvm.x86.vsha512rnds2"] + fn vsha512rnds2(a: i64x4, b: i64x4, k: i64x2) -> i64x4; + #[link_name = "llvm.x86.vsm3msg1"] + fn vsm3msg1(a: i32x4, b: i32x4, c: i32x4) -> i32x4; + #[link_name = "llvm.x86.vsm3msg2"] + fn vsm3msg2(a: i32x4, b: i32x4, c: i32x4) -> i32x4; + #[link_name = "llvm.x86.vsm3rnds2"] + fn vsm3rnds2(a: i32x4, b: i32x4, c: i32x4, d: i32) -> i32x4; + #[link_name = "llvm.x86.vsm4key4128"] + fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.vsm4key4256"] + fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.vsm4rnds4128"] + fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.vsm4rnds4256"] + fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8; } #[cfg(test)] @@ -118,21 +138,152 @@ pub fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i { unsafe { transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } } +/// This intrinsic is one of the two SHA512 message scheduling instructions. +/// The intrinsic performs an intermediate calculation for the next four SHA512 +/// message qwords. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg1))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { + unsafe { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) } +} + +/// This intrinsic is one of the two SHA512 message scheduling instructions. +/// The intrinsic performs the final calculation for the next four SHA512 message +/// qwords. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg2))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) } +} + +/// This intrinsic performs two rounds of SHA512 operation using initial SHA512 state +/// `(C,D,G,H)` from `a`, an initial SHA512 state `(A,B,E,F)` from `b`, and a +/// pre-computed sum of the next two round message qwords and the corresponding +/// round constants from `c` (only the two lower qwords of the third operand). The +/// updated SHA512 state `(A,B,E,F)` is written to dst, and dst can be used as the +/// updated state `(C,D,G,H)` in later rounds. 
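+///
+/// Usage sketch (an assumption modelled on the analogous SHA-256 intrinsics, not
+/// taken from the Intel guide): keep the state in two registers and ping-pong
+/// them, e.g. `state1 = _mm256_sha512rnds2_epi64(state1, state0, wk_lo);` then
+/// `state0 = _mm256_sha512rnds2_epi64(state0, state1, wk_hi);`, where `state0`
+/// holds `(A,B,E,F)` and `state1` holds `(C,D,G,H)` on entry.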
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512rnds2))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, k: __m128i) -> __m256i { + unsafe { transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), k.as_i64x2())) } +} + +/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs +/// an initial calculation for the next four SM3 message words. The calculated results +/// are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg1_epi32) +#[inline] +#[target_feature(enable = "sm3,avx")] +#[cfg_attr(test, assert_instr(vsm3msg1))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm_sm3msg1_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vsm3msg1(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } +} + +/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs +/// the final calculation for the next four SM3 message words. The calculated results +/// are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg2_epi32) +#[inline] +#[target_feature(enable = "sm3,avx")] +#[cfg_attr(test, assert_instr(vsm3msg2))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm_sm3msg2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vsm3msg2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } +} + +/// The intrinsic performs two rounds of SM3 operation using initial SM3 state `(C, D, G, H)` +/// from `a`, an initial SM3 states `(A, B, E, F)` from `b` and a pre-computed words from the +/// `c`. `a` with initial SM3 state of `(C, D, G, H)` assumes input of non-rotated left variables +/// from previous state. The updated SM3 state `(A, B, E, F)` is written to `a`. The `imm8` +/// should contain the even round number for the first of the two rounds computed by this instruction. +/// The computation masks the `imm8` value by ANDing it with `0x3E` so that only even round numbers +/// from 0 through 62 are used for this operation. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3rnds2_epi32) +#[inline] +#[target_feature(enable = "sm3,avx")] +#[cfg_attr(test, assert_instr(vsm3rnds2, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm_sm3rnds2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + static_assert!( + IMM8 == (IMM8 & 0x3e), + "IMM8 must be an even number in the range `0..=62`" + ); + unsafe { transmute(vsm3rnds2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8)) } +} + +/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. 
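+///
+/// Usage sketch (an illustration, not taken from the Intel guide): with `rk`
+/// holding the previous four round keys and `ck` the four matching key-schedule
+/// constants, `let rk_next = _mm_sm4key4_epi32(rk, ck);` yields the next four
+/// round keys.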
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4key4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4key4))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm_sm4key4_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vsm4key4128(a.as_i32x4(), b.as_i32x4())) } +} + +/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4key4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4key4))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) } +} + +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4rnds4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4rnds4))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm_sm4rnds4_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vsm4rnds4128(a.as_i32x4(), b.as_i32x4())) } +} + +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4rnds4_epi32) +#[inline] +#[target_feature(enable = "sm4,avx")] +#[cfg_attr(test, assert_instr(vsm4rnds4))] +#[unstable(feature = "sha512_sm_x86", issue = "126624")] +pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) } +} + #[cfg(test)] mod tests { - use std::{ - f32, f64, - mem::{self, transmute}, - }; - use crate::{ core_arch::{simd::*, x86::*}, hint::black_box, }; use stdarch_test::simd_test; - const NAN: f64 = f64::NAN; - #[simd_test(enable = "sha")] #[allow(overflowing_literals)] unsafe fn test_mm_sha1msg1_epu32() { @@ -215,4 +366,367 @@ mod tests { let r = _mm_sha256rnds2_epu32(a, b, k); assert_eq_m128i(r, expected); } + + static DATA_64: [u64; 10] = [ + 0x0011223344556677, + 0x8899aabbccddeeff, + 0xffeeddccbbaa9988, + 0x7766554433221100, + 0x0123456789abcdef, + 0xfedcba9876543210, + 0x02468ace13579bdf, + 0xfdb97531eca86420, + 0x048c159d26ae37bf, + 0xfb73ea62d951c840, + ]; + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512msg1_epi64() { + fn s0(word: u64) -> u64 { + word.rotate_right(1) ^ word.rotate_right(8) ^ (word >> 7) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..6]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm256_sha512msg1_epi64(a, b); + + let e = _mm256_setr_epi64x( + A[0].wrapping_add(s0(A[1])) as _, + A[1].wrapping_add(s0(A[2])) as _, + A[2].wrapping_add(s0(A[3])) as _, + A[3].wrapping_add(s0(B[0])) as _, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512msg2_epi64() { + fn s1(word: u64) -> u64 { + word.rotate_right(19) ^ 
word.rotate_right(61) ^ (word >> 6) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..8]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm256_loadu_si256(B.as_ptr().cast()); + + let r = _mm256_sha512msg2_epi64(a, b); + + let e0 = A[0].wrapping_add(s1(B[2])); + let e1 = A[1].wrapping_add(s1(B[3])); + let e = _mm256_setr_epi64x( + e0 as _, + e1 as _, + A[2].wrapping_add(s1(e0)) as _, + A[3].wrapping_add(s1(e1)) as _, + ); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sha512,avx")] + unsafe fn test_mm256_sha512rnds2_epi64() { + fn cap_sigma0(word: u64) -> u64 { + word.rotate_right(28) ^ word.rotate_right(34) ^ word.rotate_right(39) + } + + fn cap_sigma1(word: u64) -> u64 { + word.rotate_right(14) ^ word.rotate_right(18) ^ word.rotate_right(41) + } + + fn maj(a: u64, b: u64, c: u64) -> u64 { + (a & b) ^ (a & c) ^ (b & c) + } + + fn ch(e: u64, f: u64, g: u64) -> u64 { + (e & f) ^ (g & !e) + } + + let A = &DATA_64[0..4]; + let B = &DATA_64[4..8]; + let K = &DATA_64[8..10]; + + let a = _mm256_loadu_si256(A.as_ptr().cast()); + let b = _mm256_loadu_si256(B.as_ptr().cast()); + let k = _mm_loadu_si128(K.as_ptr().cast()); + + let r = _mm256_sha512rnds2_epi64(a, b, k); + + let mut array = [B[3], B[2], A[3], A[2], B[1], B[0], A[1], A[0]]; + for i in 0..2 { + let new_d = ch(array[4], array[5], array[6]) + .wrapping_add(cap_sigma1(array[4])) + .wrapping_add(K[i]) + .wrapping_add(array[7]); + array[7] = new_d + .wrapping_add(maj(array[0], array[1], array[2])) + .wrapping_add(cap_sigma0(array[0])); + array[3] = new_d.wrapping_add(array[3]); + array.rotate_right(1); + } + let e = _mm256_setr_epi64x(array[5] as _, array[4] as _, array[1] as _, array[0] as _); + + assert_eq_m256i(r, e); + } + + static DATA_32: [u32; 16] = [ + 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544, + 0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf, + 0xfdb97531, 0xeca86420, + ]; + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3msg1_epi32() { + fn p1(x: u32) -> u32 { + x ^ x.rotate_left(15) ^ x.rotate_left(23) + } + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3msg1_epi32(a, b, c); + + let e = _mm_setr_epi32( + p1(A[0] ^ C[0] ^ B[0].rotate_left(15)) as _, + p1(A[1] ^ C[1] ^ B[1].rotate_left(15)) as _, + p1(A[2] ^ C[2] ^ B[2].rotate_left(15)) as _, + p1(A[3] ^ C[3]) as _, + ); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3msg2_epi32() { + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3msg2_epi32(a, b, c); + + let e0 = B[0].rotate_left(7) ^ C[0] ^ A[0]; + let e = _mm_setr_epi32( + e0 as _, + (B[1].rotate_left(7) ^ C[1] ^ A[1]) as _, + (B[2].rotate_left(7) ^ C[2] ^ A[2]) as _, + (B[3].rotate_left(7) + ^ C[3] + ^ A[3] + ^ e0.rotate_left(6) + ^ e0.rotate_left(15) + ^ e0.rotate_left(30)) as _, + ); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm3,avx")] + unsafe fn test_mm_sm3rnds2_epi32() { + fn p0(x: u32) -> u32 { + x ^ x.rotate_left(9) ^ x.rotate_left(17) + } + fn ff(x: u32, y: u32, z: u32, round: u32) -> u32 { + if round < 16 { + x ^ y ^ z + } else { + (x & y) | (x & z) | (y & z) + } + } + fn gg(x: 
u32, y: u32, z: u32, round: u32) -> u32 { + if round < 16 { + x ^ y ^ z + } else { + (x & y) | (!x & z) + } + } + + const ROUND: u32 = 30; + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + let C = &DATA_32[8..12]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + let c = _mm_loadu_si128(C.as_ptr().cast()); + + let r = _mm_sm3rnds2_epi32::<{ ROUND as i32 }>(a, b, c); + + let CONST: u32 = if ROUND < 16 { 0x79cc4519 } else { 0x7a879d8a }; + + let mut array = [ + B[3], + B[2], + A[3].rotate_left(9), + A[2].rotate_left(9), + B[1], + B[0], + A[1].rotate_left(19), + A[0].rotate_left(19), + ]; + + for i in 0..2 { + let s1 = array[0] + .rotate_left(12) + .wrapping_add(array[4]) + .wrapping_add(CONST.rotate_left(ROUND as u32 + i as u32)) + .rotate_left(7); + let s2 = s1 ^ array[0].rotate_left(12); + + let t1 = ff(array[0], array[1], array[2], ROUND) + .wrapping_add(array[3]) + .wrapping_add(s2) + .wrapping_add(C[i] ^ C[i + 2]); + let t2 = gg(array[4], array[5], array[6], ROUND) + .wrapping_add(array[7]) + .wrapping_add(s1) + .wrapping_add(C[i]); + + array[3] = array[2]; + array[2] = array[1].rotate_left(9); + array[1] = array[0]; + array[0] = t1; + array[7] = array[6]; + array[6] = array[5].rotate_left(19); + array[5] = array[4]; + array[4] = p0(t2); + } + + let e = _mm_setr_epi32(array[5] as _, array[4] as _, array[1] as _, array[0] as _); + + assert_eq_m128i(r, e); + } + + fn lower_t(x: u32) -> u32 { + static SBOX: [u8; 256] = [ + 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, + 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, + 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, + 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, + 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, + 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, + 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24, + 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, + 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, + 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, + 0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, + 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, + 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, + 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92, + 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, + 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, + 0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, + 0xC6, 0x84, 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, + 0xD7, 0xCB, 0x39, 0x48, + ]; + + ((SBOX[(x >> 24) as usize] as u32) << 24) + | ((SBOX[((x >> 16) & 0xff) as usize] as u32) << 16) + | ((SBOX[((x >> 8) & 0xff) as usize] as u32) << 8) + | (SBOX[(x & 0xff) as usize] as u32) + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm_sm4key4_epi32() { + fn l_key(x: u32) -> u32 { + x ^ x.rotate_left(13) ^ x.rotate_left(23) + } + fn f_key(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { + x0 ^ l_key(lower_t(x1 ^ x2 ^ x3 ^ rk)) + } + + let A = 
&DATA_32[0..4]; + let B = &DATA_32[4..8]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm_sm4key4_epi32(a, b); + + let e0 = f_key(A[0], A[1], A[2], A[3], B[0]); + let e1 = f_key(A[1], A[2], A[3], e0, B[1]); + let e2 = f_key(A[2], A[3], e0, e1, B[2]); + let e3 = f_key(A[3], e0, e1, e2, B[3]); + let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm256_sm4key4_epi32() { + let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); + let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); + let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); + let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); + + let a = _mm256_set_m128i(a_high, a_low); + let b = _mm256_set_m128i(b_high, b_low); + + let r = _mm256_sm4key4_epi32(a, b); + + let e_low = _mm_sm4key4_epi32(a_low, b_low); + let e_high = _mm_sm4key4_epi32(a_high, b_high); + let e = _mm256_set_m128i(e_high, e_low); + + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm_sm4rnds4_epi32() { + fn l_rnd(x: u32) -> u32 { + x ^ x.rotate_left(2) ^ x.rotate_left(10) ^ x.rotate_left(18) ^ x.rotate_left(24) + } + fn f_rnd(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 { + x0 ^ l_rnd(lower_t(x1 ^ x2 ^ x3 ^ rk)) + } + + let A = &DATA_32[0..4]; + let B = &DATA_32[4..8]; + + let a = _mm_loadu_si128(A.as_ptr().cast()); + let b = _mm_loadu_si128(B.as_ptr().cast()); + + let r = _mm_sm4rnds4_epi32(a, b); + + let e0 = f_rnd(A[0], A[1], A[2], A[3], B[0]); + let e1 = f_rnd(A[1], A[2], A[3], e0, B[1]); + let e2 = f_rnd(A[2], A[3], e0, e1, B[2]); + let e3 = f_rnd(A[3], e0, e1, e2, B[3]); + let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _); + + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "sm4,avx")] + unsafe fn test_mm256_sm4rnds4_epi32() { + let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast()); + let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast()); + let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast()); + let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast()); + + let a = _mm256_set_m128i(a_high, a_low); + let b = _mm256_set_m128i(b_high, b_low); + + let r = _mm256_sm4rnds4_epi32(a, b); + + let e_low = _mm_sm4rnds4_epi32(a_low, b_low); + let e_high = _mm_sm4rnds4_epi32(a_high, b_high); + let e = _mm256_set_m128i(e_high, e_low); + + assert_eq_m256i(r, e); + } }
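
A minimal usage sketch of the new SHA-512 intrinsics added above (not part of the
patch; assumes a nightly toolchain with the `sha512_sm_x86` feature gate, and the
splitting of `wk` and the state ping-pong convention are assumptions modelled on
the analogous SHA-256 intrinsics rather than taken from the Intel guide):

    #![feature(sha512_sm_x86)]

    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    /// Four SHA-512 rounds via two `rnds2` steps. `state0` holds (A,B,E,F),
    /// `state1` holds (C,D,G,H), and `wk` holds four message qwords that the
    /// caller has already added to the matching round constants (assumption:
    /// low half for the first two rounds, high half for the next two).
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sha512,avx,avx2")]
    unsafe fn sha512_four_rounds(
        state0: __m256i,
        state1: __m256i,
        wk: __m256i,
    ) -> (__m256i, __m256i) {
        let wk_lo = _mm256_castsi256_si128(wk);          // rounds i, i+1
        let wk_hi = _mm256_extracti128_si256::<1>(wk);   // rounds i+2, i+3
        // Each call returns the updated (A,B,E,F); the register that held the
        // previous (A,B,E,F) then serves as (C,D,G,H) for the following call.
        let state1 = _mm256_sha512rnds2_epi64(state1, state0, wk_lo);
        let state0 = _mm256_sha512rnds2_epi64(state0, state1, wk_hi);
        (state0, state1)
    }

Because the function is compiled with `sha512`/`avx2` enabled, a caller must first
verify CPU support at runtime (e.g. via CPUID) before invoking it.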