
Commit 6154e40

sayantn authored and Amanieu committed on May 1, 2025

Implement sha512, sm3 and sm4 intrinsics

1 parent 08aac47 · commit 6154e40

File tree

2 files changed: +521 −32 lines

 

crates/core_arch/missing-x86.md

0 additions, 25 deletions
```diff
@@ -213,31 +213,6 @@
 </p></details>
 
 
-<details><summary>["SHA512", "AVX"]</summary><p>
-
-* [ ] [`_mm256_sha512msg1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sha512msg1_epi64)
-* [ ] [`_mm256_sha512msg2_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sha512msg2_epi64)
-* [ ] [`_mm256_sha512rnds2_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sha512rnds2_epi64)
-</p></details>
-
-
-<details><summary>["SM3", "AVX"]</summary><p>
-
-* [ ] [`_mm_sm3msg1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm3msg1_epi32)
-* [ ] [`_mm_sm3msg2_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm3msg2_epi32)
-* [ ] [`_mm_sm3rnds2_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm3rnds2_epi32)
-</p></details>
-
-
-<details><summary>["SM4", "AVX"]</summary><p>
-
-* [ ] [`_mm256_sm4key4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sm4key4_epi32)
-* [ ] [`_mm256_sm4rnds4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sm4rnds4_epi32)
-* [ ] [`_mm_sm4key4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm4key4_epi32)
-* [ ] [`_mm_sm4rnds4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sm4rnds4_epi32)
-</p></details>
-
-
 <details><summary>["SSE"]</summary><p>
 
 * [ ] [`_mm_free`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free)
```

crates/core_arch/src/x86/sha.rs

521 additions, 7 deletions
```diff
@@ -16,6 +16,26 @@ unsafe extern "C" {
     fn sha256msg2(a: i32x4, b: i32x4) -> i32x4;
     #[link_name = "llvm.x86.sha256rnds2"]
     fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsha512msg1"]
+    fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsha512msg2"]
+    fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.vsha512rnds2"]
+    fn vsha512rnds2(a: i64x4, b: i64x4, k: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsm3msg1"]
+    fn vsm3msg1(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3msg2"]
+    fn vsm3msg2(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3rnds2"]
+    fn vsm3rnds2(a: i32x4, b: i32x4, c: i32x4, d: i32) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4128"]
+    fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4256"]
+    fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.vsm4rnds4128"]
+    fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4rnds4256"]
+    fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8;
 }
 
 #[cfg(test)]
```
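These link names bind directly to LLVM's intrinsics for the new VSHA512\*, VSM3\*, and VSM4\* instructions, so a caller would normally gate their use behind runtime feature detection. A minimal caller-side sketch, not part of this commit: it assumes a nightly toolchain with the `sha512_sm_x86` gate enabled, and assumes `is_x86_feature_detected!` accepts a `"sha512"` detection string (which may sit behind its own nightly gate).

```rust
#![feature(sha512_sm_x86)]

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
fn try_sha512msg1(a: __m256i, b: __m128i) -> Option<__m256i> {
    // Only take the fast path when the CPU implements VSHA512MSG1.
    // The "sha512" detection string is an assumption about nightly std_detect.
    if is_x86_feature_detected!("sha512") && is_x86_feature_detected!("avx") {
        // SAFETY: the required target features were verified above.
        Some(unsafe { msg1(a, b) })
    } else {
        None
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sha512,avx")]
unsafe fn msg1(a: __m256i, b: __m128i) -> __m256i {
    _mm256_sha512msg1_epi64(a, b)
}
```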

```diff
@@ -118,21 +138,152 @@ pub fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i {
     unsafe { transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) }
 }
 
+/// This intrinsic is one of the two SHA512 message scheduling instructions.
+/// The intrinsic performs an intermediate calculation for the next four SHA512
+/// message qwords. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64)
+#[inline]
+#[target_feature(enable = "sha512,avx")]
+#[cfg_attr(test, assert_instr(vsha512msg1))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i {
+    unsafe { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) }
+}
+
+/// This intrinsic is one of the two SHA512 message scheduling instructions.
+/// The intrinsic performs the final calculation for the next four SHA512 message
+/// qwords. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64)
+#[inline]
+#[target_feature(enable = "sha512,avx")]
+#[cfg_attr(test, assert_instr(vsha512msg2))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) }
+}
+
+/// This intrinsic performs two rounds of SHA512 operation using initial SHA512 state
+/// `(C,D,G,H)` from `a`, an initial SHA512 state `(A,B,E,F)` from `b`, and a
+/// pre-computed sum of the next two round message qwords and the corresponding
+/// round constants from `k` (only the two lower qwords of the third operand). The
+/// updated SHA512 state `(A,B,E,F)` is written to dst, and dst can be used as the
+/// updated state `(C,D,G,H)` in later rounds.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64)
+#[inline]
+#[target_feature(enable = "sha512,avx")]
+#[cfg_attr(test, assert_instr(vsha512rnds2))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, k: __m128i) -> __m256i {
+    unsafe { transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), k.as_i64x2())) }
+}
+
```
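The three intrinsics compose: `msg1` and `msg2` extend the message schedule four qwords at a time, while `rnds2` consumes two rounds' worth of pre-summed schedule-plus-constant values. A rough sketch of one schedule-extension step (nightly-only; the vector layout is an assumption, and the `W[t-7]` additions a real kernel needs between the two steps are deliberately omitted):

```rust
#![feature(sha512_sm_x86)] // nightly gate for these intrinsics; x86_64 only
use std::arch::x86_64::*;

/// Extend the SHA-512 message schedule by four qwords. Sketch only: a full
/// implementation must also add the W[t-7] terms between msg1 and msg2,
/// which this simplified version leaves out.
#[target_feature(enable = "sha512,avx")]
unsafe fn schedule_step(w0: __m256i, w1: __m256i, w3: __m256i) -> __m256i {
    // msg1: W[t-16..t-13] + sigma0(W[t-15..t-12]); the xmm operand
    // supplies the qword that crosses the 256-bit boundary.
    let t = _mm256_sha512msg1_epi64(w0, _mm256_castsi256_si128(w1));
    // msg2: fold in the sigma1(W[t-2]) terms; the serial dependency between
    // the lower and upper halves is handled inside the instruction.
    _mm256_sha512msg2_epi64(t, w3)
}
```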

```diff
+/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs
+/// an initial calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg1_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg1))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm3msg1_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg1(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs
+/// the final calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg2))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm3msg2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// The intrinsic performs two rounds of SM3 operation using initial SM3 state
+/// `(C, D, G, H)` from `a`, an initial SM3 state `(A, B, E, F)` from `b`, and
+/// pre-computed message words from `c`. The state `(C, D, G, H)` in `a` is assumed
+/// to hold the non-rotated variables of the previous state. The updated SM3 state
+/// `(A, B, E, F)` is written to dst. `IMM8` should contain the even round number
+/// for the first of the two rounds computed by this instruction; the computation
+/// masks `IMM8` by ANDing it with `0x3E` so that only even round numbers from 0
+/// through 62 are used.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3rnds2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3rnds2, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm3rnds2_epi32<const IMM8: i32>(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    static_assert!(
+        IMM8 == (IMM8 & 0x3e),
+        "IMM8 must be an even number in the range `0..=62`"
+    );
+    unsafe { transmute(vsm3rnds2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8)) }
+}
+
```
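Because the round number is a const generic (it lowers to the instruction's immediate), a full 64-round SM3 compression has to unroll or macro-expand the round index. A minimal nightly-only sketch of one double-round, under the state layout described above:

```rust
#![feature(sha512_sm_x86)] // nightly gate for these intrinsics; x86_64 only
use std::arch::x86_64::*;

/// Run SM3 rounds 6 and 7 on a state split across two vectors.
/// `wj` carries the pre-computed message words for these rounds;
/// the lane layout follows the intrinsic documentation above.
#[target_feature(enable = "sm3,avx")]
unsafe fn sm3_rounds_6_7(cdgh: __m128i, abef: __m128i, wj: __m128i) -> __m128i {
    // IMM8 is masked with 0x3E by the instruction, so only even
    // round numbers 0..=62 are meaningful here.
    _mm_sm3rnds2_epi32::<6>(cdgh, abef, wj)
}
```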

```diff
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4key4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4key4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm4key4_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vsm4key4128(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4key4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4key4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4rnds4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4rnds4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm4rnds4_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vsm4rnds4128(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4rnds4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4rnds4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) }
+}
+
```
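Each call covers four of SM4's 32 rounds, so a full key schedule chains eight calls, feeding each output back in as the next input; the 256-bit forms simply process two independent 128-bit lanes. A nightly-only sketch of the key-expansion loop (the FK whitening and CK constant packing are assumptions left to the caller):

```rust
#![feature(sha512_sm_x86)] // nightly gate for these intrinsics; x86_64 only
use std::arch::x86_64::*;

/// Expand an SM4 key into 32 round keys, four per vector. `state` is assumed
/// to be the 128-bit master key already XORed with the FK constants, and
/// `ck[i]` to pack round constants CK[4i..4i+4].
#[target_feature(enable = "sm4,avx")]
unsafe fn sm4_expand_key(mut state: __m128i, ck: &[__m128i; 8]) -> [__m128i; 8] {
    let mut rk = [_mm_setzero_si128(); 8];
    for i in 0..8 {
        // Four key-schedule rounds: the four new round keys become the
        // chaining state for the next call.
        state = _mm_sm4key4_epi32(state, ck[i]);
        rk[i] = state;
    }
    rk
}
```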

```diff
 #[cfg(test)]
 mod tests {
-    use std::{
-        f32, f64,
-        mem::{self, transmute},
-    };
-
     use crate::{
         core_arch::{simd::*, x86::*},
         hint::black_box,
     };
     use stdarch_test::simd_test;
 
-    const NAN: f64 = f64::NAN;
-
     #[simd_test(enable = "sha")]
     #[allow(overflowing_literals)]
     unsafe fn test_mm_sha1msg1_epu32() {
```
```diff
@@ -215,4 +366,367 @@ mod tests {
         let r = _mm_sha256rnds2_epu32(a, b, k);
         assert_eq_m128i(r, expected);
     }
+
+    static DATA_64: [u64; 10] = [
+        0x0011223344556677,
+        0x8899aabbccddeeff,
+        0xffeeddccbbaa9988,
+        0x7766554433221100,
+        0x0123456789abcdef,
+        0xfedcba9876543210,
+        0x02468ace13579bdf,
+        0xfdb97531eca86420,
+        0x048c159d26ae37bf,
+        0xfb73ea62d951c840,
+    ];
+
+    #[simd_test(enable = "sha512,avx")]
+    unsafe fn test_mm256_sha512msg1_epi64() {
+        fn s0(word: u64) -> u64 {
+            word.rotate_right(1) ^ word.rotate_right(8) ^ (word >> 7)
+        }
+
+        let A = &DATA_64[0..4];
+        let B = &DATA_64[4..6];
+
+        let a = _mm256_loadu_si256(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+
+        let r = _mm256_sha512msg1_epi64(a, b);
+
+        let e = _mm256_setr_epi64x(
+            A[0].wrapping_add(s0(A[1])) as _,
+            A[1].wrapping_add(s0(A[2])) as _,
+            A[2].wrapping_add(s0(A[3])) as _,
+            A[3].wrapping_add(s0(B[0])) as _,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "sha512,avx")]
+    unsafe fn test_mm256_sha512msg2_epi64() {
+        fn s1(word: u64) -> u64 {
+            word.rotate_right(19) ^ word.rotate_right(61) ^ (word >> 6)
+        }
+
+        let A = &DATA_64[0..4];
+        let B = &DATA_64[4..8];
+
+        let a = _mm256_loadu_si256(A.as_ptr().cast());
+        let b = _mm256_loadu_si256(B.as_ptr().cast());
+
+        let r = _mm256_sha512msg2_epi64(a, b);
+
+        let e0 = A[0].wrapping_add(s1(B[2]));
+        let e1 = A[1].wrapping_add(s1(B[3]));
+        let e = _mm256_setr_epi64x(
+            e0 as _,
+            e1 as _,
+            A[2].wrapping_add(s1(e0)) as _,
+            A[3].wrapping_add(s1(e1)) as _,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "sha512,avx")]
+    unsafe fn test_mm256_sha512rnds2_epi64() {
+        fn cap_sigma0(word: u64) -> u64 {
+            word.rotate_right(28) ^ word.rotate_right(34) ^ word.rotate_right(39)
+        }
+
+        fn cap_sigma1(word: u64) -> u64 {
+            word.rotate_right(14) ^ word.rotate_right(18) ^ word.rotate_right(41)
+        }
+
+        fn maj(a: u64, b: u64, c: u64) -> u64 {
+            (a & b) ^ (a & c) ^ (b & c)
+        }
+
+        fn ch(e: u64, f: u64, g: u64) -> u64 {
+            (e & f) ^ (g & !e)
+        }
+
+        let A = &DATA_64[0..4];
+        let B = &DATA_64[4..8];
+        let K = &DATA_64[8..10];
+
+        let a = _mm256_loadu_si256(A.as_ptr().cast());
+        let b = _mm256_loadu_si256(B.as_ptr().cast());
+        let k = _mm_loadu_si128(K.as_ptr().cast());
+
+        let r = _mm256_sha512rnds2_epi64(a, b, k);
+
+        let mut array = [B[3], B[2], A[3], A[2], B[1], B[0], A[1], A[0]];
+        for i in 0..2 {
+            let new_d = ch(array[4], array[5], array[6])
+                .wrapping_add(cap_sigma1(array[4]))
+                .wrapping_add(K[i])
+                .wrapping_add(array[7]);
+            array[7] = new_d
+                .wrapping_add(maj(array[0], array[1], array[2]))
+                .wrapping_add(cap_sigma0(array[0]));
+            array[3] = new_d.wrapping_add(array[3]);
+            array.rotate_right(1);
+        }
+        let e = _mm256_setr_epi64x(array[5] as _, array[4] as _, array[1] as _, array[0] as _);
+
+        assert_eq_m256i(r, e);
+    }
+
+    static DATA_32: [u32; 16] = [
+        0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544,
+        0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf,
+        0xfdb97531, 0xeca86420,
+    ];
+
+    #[simd_test(enable = "sm3,avx")]
+    unsafe fn test_mm_sm3msg1_epi32() {
+        fn p1(x: u32) -> u32 {
+            x ^ x.rotate_left(15) ^ x.rotate_left(23)
+        }
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+        let C = &DATA_32[8..12];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+        let c = _mm_loadu_si128(C.as_ptr().cast());
+
+        let r = _mm_sm3msg1_epi32(a, b, c);
+
+        let e = _mm_setr_epi32(
+            p1(A[0] ^ C[0] ^ B[0].rotate_left(15)) as _,
+            p1(A[1] ^ C[1] ^ B[1].rotate_left(15)) as _,
+            p1(A[2] ^ C[2] ^ B[2].rotate_left(15)) as _,
+            p1(A[3] ^ C[3]) as _,
+        );
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm3,avx")]
+    unsafe fn test_mm_sm3msg2_epi32() {
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+        let C = &DATA_32[8..12];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+        let c = _mm_loadu_si128(C.as_ptr().cast());
+
+        let r = _mm_sm3msg2_epi32(a, b, c);
+
+        let e0 = B[0].rotate_left(7) ^ C[0] ^ A[0];
+        let e = _mm_setr_epi32(
+            e0 as _,
+            (B[1].rotate_left(7) ^ C[1] ^ A[1]) as _,
+            (B[2].rotate_left(7) ^ C[2] ^ A[2]) as _,
+            (B[3].rotate_left(7)
+                ^ C[3]
+                ^ A[3]
+                ^ e0.rotate_left(6)
+                ^ e0.rotate_left(15)
+                ^ e0.rotate_left(30)) as _,
+        );
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm3,avx")]
+    unsafe fn test_mm_sm3rnds2_epi32() {
+        fn p0(x: u32) -> u32 {
+            x ^ x.rotate_left(9) ^ x.rotate_left(17)
+        }
+        fn ff(x: u32, y: u32, z: u32, round: u32) -> u32 {
+            if round < 16 {
+                x ^ y ^ z
+            } else {
+                (x & y) | (x & z) | (y & z)
+            }
+        }
+        fn gg(x: u32, y: u32, z: u32, round: u32) -> u32 {
+            if round < 16 {
+                x ^ y ^ z
+            } else {
+                (x & y) | (!x & z)
+            }
+        }
+
+        const ROUND: u32 = 30;
+
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+        let C = &DATA_32[8..12];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+        let c = _mm_loadu_si128(C.as_ptr().cast());
+
+        let r = _mm_sm3rnds2_epi32::<{ ROUND as i32 }>(a, b, c);
+
+        let CONST: u32 = if ROUND < 16 { 0x79cc4519 } else { 0x7a879d8a };
+
+        let mut array = [
+            B[3],
+            B[2],
+            A[3].rotate_left(9),
+            A[2].rotate_left(9),
+            B[1],
+            B[0],
+            A[1].rotate_left(19),
+            A[0].rotate_left(19),
+        ];
+
+        for i in 0..2 {
+            let s1 = array[0]
+                .rotate_left(12)
+                .wrapping_add(array[4])
+                .wrapping_add(CONST.rotate_left(ROUND as u32 + i as u32))
+                .rotate_left(7);
+            let s2 = s1 ^ array[0].rotate_left(12);
+
+            let t1 = ff(array[0], array[1], array[2], ROUND)
+                .wrapping_add(array[3])
+                .wrapping_add(s2)
+                .wrapping_add(C[i] ^ C[i + 2]);
+            let t2 = gg(array[4], array[5], array[6], ROUND)
+                .wrapping_add(array[7])
+                .wrapping_add(s1)
+                .wrapping_add(C[i]);
+
+            array[3] = array[2];
+            array[2] = array[1].rotate_left(9);
+            array[1] = array[0];
+            array[0] = t1;
+            array[7] = array[6];
+            array[6] = array[5].rotate_left(19);
+            array[5] = array[4];
+            array[4] = p0(t2);
+        }
+
+        let e = _mm_setr_epi32(array[5] as _, array[4] as _, array[1] as _, array[0] as _);
+
+        assert_eq_m128i(r, e);
+    }
+
+    fn lower_t(x: u32) -> u32 {
+        static SBOX: [u8; 256] = [
+            0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB,
+            0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26,
+            0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54,
+            0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95,
+            0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73,
+            0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2,
+            0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24,
+            0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
+            0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4,
+            0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE,
+            0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93,
+            0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60,
+            0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD,
+            0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92,
+            0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1,
+            0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
+            0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E,
+            0xC6, 0x84, 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
+            0xD7, 0xCB, 0x39, 0x48,
+        ];
+
+        ((SBOX[(x >> 24) as usize] as u32) << 24)
+            | ((SBOX[((x >> 16) & 0xff) as usize] as u32) << 16)
+            | ((SBOX[((x >> 8) & 0xff) as usize] as u32) << 8)
+            | (SBOX[(x & 0xff) as usize] as u32)
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm_sm4key4_epi32() {
+        fn l_key(x: u32) -> u32 {
+            x ^ x.rotate_left(13) ^ x.rotate_left(23)
+        }
+        fn f_key(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+            x0 ^ l_key(lower_t(x1 ^ x2 ^ x3 ^ rk))
+        }
+
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+
+        let r = _mm_sm4key4_epi32(a, b);
+
+        let e0 = f_key(A[0], A[1], A[2], A[3], B[0]);
+        let e1 = f_key(A[1], A[2], A[3], e0, B[1]);
+        let e2 = f_key(A[2], A[3], e0, e1, B[2]);
+        let e3 = f_key(A[3], e0, e1, e2, B[3]);
+        let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _);
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm256_sm4key4_epi32() {
+        let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast());
+        let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast());
+        let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast());
+        let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast());
+
+        let a = _mm256_set_m128i(a_high, a_low);
+        let b = _mm256_set_m128i(b_high, b_low);
+
+        let r = _mm256_sm4key4_epi32(a, b);
+
+        let e_low = _mm_sm4key4_epi32(a_low, b_low);
+        let e_high = _mm_sm4key4_epi32(a_high, b_high);
+        let e = _mm256_set_m128i(e_high, e_low);
+
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm_sm4rnds4_epi32() {
+        fn l_rnd(x: u32) -> u32 {
+            x ^ x.rotate_left(2) ^ x.rotate_left(10) ^ x.rotate_left(18) ^ x.rotate_left(24)
+        }
+        fn f_rnd(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+            x0 ^ l_rnd(lower_t(x1 ^ x2 ^ x3 ^ rk))
+        }
+
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+
+        let r = _mm_sm4rnds4_epi32(a, b);
+
+        let e0 = f_rnd(A[0], A[1], A[2], A[3], B[0]);
+        let e1 = f_rnd(A[1], A[2], A[3], e0, B[1]);
+        let e2 = f_rnd(A[2], A[3], e0, e1, B[2]);
+        let e3 = f_rnd(A[3], e0, e1, e2, B[3]);
+        let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _);
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm256_sm4rnds4_epi32() {
+        let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast());
+        let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast());
+        let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast());
+        let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast());
+
+        let a = _mm256_set_m128i(a_high, a_low);
+        let b = _mm256_set_m128i(b_high, b_low);
+
+        let r = _mm256_sm4rnds4_epi32(a, b);
+
+        let e_low = _mm_sm4rnds4_epi32(a_low, b_low);
+        let e_high = _mm_sm4rnds4_epi32(a_high, b_high);
+        let e = _mm256_set_m128i(e_high, e_low);
+
+        assert_eq_m256i(r, e);
+    }
 }
```
