@@ -16,6 +16,26 @@ unsafe extern "C" {
     fn sha256msg2(a: i32x4, b: i32x4) -> i32x4;
     #[link_name = "llvm.x86.sha256rnds2"]
     fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsha512msg1"]
+    fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsha512msg2"]
+    fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4;
+    #[link_name = "llvm.x86.vsha512rnds2"]
+    fn vsha512rnds2(a: i64x4, b: i64x4, k: i64x2) -> i64x4;
+    #[link_name = "llvm.x86.vsm3msg1"]
+    fn vsm3msg1(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3msg2"]
+    fn vsm3msg2(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm3rnds2"]
+    fn vsm3rnds2(a: i32x4, b: i32x4, c: i32x4, d: i32) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4128"]
+    fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4key4256"]
+    fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8;
+    #[link_name = "llvm.x86.vsm4rnds4128"]
+    fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.vsm4rnds4256"]
+    fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8;
 }
 
 #[cfg(test)]
@@ -118,21 +138,152 @@ pub fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m128i {
     unsafe { transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) }
 }
 
+/// This intrinsic is one of the two SHA512 message scheduling instructions.
+/// The intrinsic performs an intermediate calculation for the next four SHA512
+/// message qwords. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64)
+#[inline]
+#[target_feature(enable = "sha512,avx")]
+#[cfg_attr(test, assert_instr(vsha512msg1))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i {
+    unsafe { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) }
+}
+
+/// This intrinsic is one of the two SHA512 message scheduling instructions.
+/// The intrinsic performs the final calculation for the next four SHA512 message
+/// qwords. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64)
+#[inline]
+#[target_feature(enable = "sha512,avx")]
+#[cfg_attr(test, assert_instr(vsha512msg2))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) }
+}
+
+/// This intrinsic performs two rounds of SHA512 operation using initial SHA512 state
+/// `(C,D,G,H)` from `a`, an initial SHA512 state `(A,B,E,F)` from `b`, and a
+/// pre-computed sum of the next two round message qwords and the corresponding
+/// round constants from `k` (only the two lower qwords of the third operand). The
+/// updated SHA512 state `(A,B,E,F)` is written to dst, and dst can be used as the
+/// updated state `(C,D,G,H)` in later rounds.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64)
+#[inline]
+#[target_feature(enable = "sha512,avx")]
+#[cfg_attr(test, assert_instr(vsha512rnds2))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, k: __m128i) -> __m256i {
+    unsafe { transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), k.as_i64x2())) }
+}
+
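// Illustrative usage sketch (not part of this patch): calling the new SHA512 round
// intrinsic from ordinary user code behind runtime feature detection. Assumes a
// nightly toolchain with `#![feature(sha512_sm_x86)]` and that
// `is_x86_feature_detected!` accepts the "sha512" token; the function name and
// argument names below are hypothetical.
#[cfg(target_arch = "x86_64")]
fn two_sha512_rounds(
    cdgh: std::arch::x86_64::__m256i, // state (C, D, G, H)
    abef: std::arch::x86_64::__m256i, // state (A, B, E, F)
    wk: std::arch::x86_64::__m128i,   // next two message qwords already summed with their round constants
) -> Option<std::arch::x86_64::__m256i> {
    if is_x86_feature_detected!("sha512") && is_x86_feature_detected!("avx") {
        // SAFETY: the required CPU features were just verified at runtime.
        Some(unsafe { std::arch::x86_64::_mm256_sha512rnds2_epi64(cdgh, abef, wk) })
    } else {
        None
    }
}
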
+/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs
+/// an initial calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg1_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg1))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm3msg1_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg1(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// This is one of the two SM3 message scheduling intrinsics. The intrinsic performs
+/// the final calculation for the next four SM3 message words. The calculated results
+/// are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3msg2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3msg2))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm3msg2_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    unsafe { transmute(vsm3msg2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) }
+}
+
+/// The intrinsic performs two rounds of SM3 operation using the initial SM3 state
+/// `(C, D, G, H)` from `a`, the initial SM3 state `(A, B, E, F)` from `b`, and
+/// pre-computed words from `c`. `a` with the initial SM3 state `(C, D, G, H)` assumes
+/// input of non-rotated left variables from the previous state. The updated SM3 state
+/// `(A, B, E, F)` is written to dst. The `imm8` should contain the even round number
+/// for the first of the two rounds computed by this instruction. The computation masks
+/// the `imm8` value by ANDing it with `0x3E`, so that only even round numbers from 0
+/// through 62 are used for this operation. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm3rnds2_epi32)
+#[inline]
+#[target_feature(enable = "sm3,avx")]
+#[cfg_attr(test, assert_instr(vsm3rnds2, IMM8 = 0))]
+#[rustc_legacy_const_generics(3)]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm3rnds2_epi32<const IMM8: i32>(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    static_assert!(
+        IMM8 == (IMM8 & 0x3e),
+        "IMM8 must be an even number in the range `0..=62`"
+    );
+    unsafe { transmute(vsm3rnds2(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8)) }
+}
+
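// Illustrative sketch (not part of this patch) of how the round number reaches
// vsm3rnds2: IMM8 is a const generic, so it must be a compile-time constant and,
// per the static_assert above, an even value in 0..=62. Assumes a nightly
// toolchain with `#![feature(sha512_sm_x86)]`; the wrapper name is hypothetical.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sm3,avx")]
unsafe fn sm3_rounds_30_and_31(
    cdgh: std::arch::x86_64::__m128i,
    abef: std::arch::x86_64::__m128i,
    words: std::arch::x86_64::__m128i,
) -> std::arch::x86_64::__m128i {
    // The even base round (30) selects rounds 30 and 31; odd or out-of-range
    // values are rejected at compile time by the static_assert in the intrinsic.
    std::arch::x86_64::_mm_sm3rnds2_epi32::<30>(cdgh, abef, words)
}
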
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4key4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4key4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm4key4_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vsm4key4128(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4key4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4key4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) }
+}
+
+/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sm4rnds4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4rnds4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm_sm4rnds4_epi32(a: __m128i, b: __m128i) -> __m128i {
+    unsafe { transmute(vsm4rnds4128(a.as_i32x4(), b.as_i32x4())) }
+}
+
+/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent
+/// 128-bit lanes. The calculated results are stored in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sm4rnds4_epi32)
+#[inline]
+#[target_feature(enable = "sm4,avx")]
+#[cfg_attr(test, assert_instr(vsm4rnds4))]
+#[unstable(feature = "sha512_sm_x86", issue = "126624")]
+pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i {
+    unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) }
+}
+
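// Illustrative sketch (not part of this patch): deriving the first four SM4 round
// keys with the 128-bit key-expansion intrinsic. `k0` is assumed to already hold the
// four words of MK ^ FK; the constants below are the first four SM4 key-schedule
// constants CK_0..CK_3 (byte j of CK_i is (4*i + j) * 7 mod 256). Assumes a nightly
// toolchain with `#![feature(sha512_sm_x86)]`; the wrapper name is hypothetical.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sm4,avx")]
unsafe fn sm4_first_four_round_keys(k0: std::arch::x86_64::__m128i) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::{_mm_setr_epi32, _mm_sm4key4_epi32};
    let ck = _mm_setr_epi32(0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269);
    // One call consumes four previous key words plus four CK constants and
    // produces the next four round keys.
    _mm_sm4key4_epi32(k0, ck)
}
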
 #[cfg(test)]
 mod tests {
-    use std::{
-        f32, f64,
-        mem::{self, transmute},
-    };
-
     use crate::{
         core_arch::{simd::*, x86::*},
         hint::black_box,
     };
     use stdarch_test::simd_test;
 
-    const NAN: f64 = f64::NAN;
-
     #[simd_test(enable = "sha")]
     #[allow(overflowing_literals)]
     unsafe fn test_mm_sha1msg1_epu32() {
@@ -215,4 +366,367 @@ mod tests {
         let r = _mm_sha256rnds2_epu32(a, b, k);
         assert_eq_m128i(r, expected);
     }
+
+    static DATA_64: [u64; 10] = [
+        0x0011223344556677,
+        0x8899aabbccddeeff,
+        0xffeeddccbbaa9988,
+        0x7766554433221100,
+        0x0123456789abcdef,
+        0xfedcba9876543210,
+        0x02468ace13579bdf,
+        0xfdb97531eca86420,
+        0x048c159d26ae37bf,
+        0xfb73ea62d951c840,
+    ];
+
+    #[simd_test(enable = "sha512,avx")]
+    unsafe fn test_mm256_sha512msg1_epi64() {
+        fn s0(word: u64) -> u64 {
+            word.rotate_right(1) ^ word.rotate_right(8) ^ (word >> 7)
+        }
+
+        let A = &DATA_64[0..4];
+        let B = &DATA_64[4..6];
+
+        let a = _mm256_loadu_si256(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+
+        let r = _mm256_sha512msg1_epi64(a, b);
+
+        let e = _mm256_setr_epi64x(
+            A[0].wrapping_add(s0(A[1])) as _,
+            A[1].wrapping_add(s0(A[2])) as _,
+            A[2].wrapping_add(s0(A[3])) as _,
+            A[3].wrapping_add(s0(B[0])) as _,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "sha512,avx")]
+    unsafe fn test_mm256_sha512msg2_epi64() {
+        fn s1(word: u64) -> u64 {
+            word.rotate_right(19) ^ word.rotate_right(61) ^ (word >> 6)
+        }
+
+        let A = &DATA_64[0..4];
+        let B = &DATA_64[4..8];
+
+        let a = _mm256_loadu_si256(A.as_ptr().cast());
+        let b = _mm256_loadu_si256(B.as_ptr().cast());
+
+        let r = _mm256_sha512msg2_epi64(a, b);
+
+        let e0 = A[0].wrapping_add(s1(B[2]));
+        let e1 = A[1].wrapping_add(s1(B[3]));
+        let e = _mm256_setr_epi64x(
+            e0 as _,
+            e1 as _,
+            A[2].wrapping_add(s1(e0)) as _,
+            A[3].wrapping_add(s1(e1)) as _,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "sha512,avx")]
+    unsafe fn test_mm256_sha512rnds2_epi64() {
+        fn cap_sigma0(word: u64) -> u64 {
+            word.rotate_right(28) ^ word.rotate_right(34) ^ word.rotate_right(39)
+        }
+
+        fn cap_sigma1(word: u64) -> u64 {
+            word.rotate_right(14) ^ word.rotate_right(18) ^ word.rotate_right(41)
+        }
+
+        fn maj(a: u64, b: u64, c: u64) -> u64 {
+            (a & b) ^ (a & c) ^ (b & c)
+        }
+
+        fn ch(e: u64, f: u64, g: u64) -> u64 {
+            (e & f) ^ (g & !e)
+        }
+
+        let A = &DATA_64[0..4];
+        let B = &DATA_64[4..8];
+        let K = &DATA_64[8..10];
+
+        let a = _mm256_loadu_si256(A.as_ptr().cast());
+        let b = _mm256_loadu_si256(B.as_ptr().cast());
+        let k = _mm_loadu_si128(K.as_ptr().cast());
+
+        let r = _mm256_sha512rnds2_epi64(a, b, k);
+
+        let mut array = [B[3], B[2], A[3], A[2], B[1], B[0], A[1], A[0]];
+        for i in 0..2 {
+            let new_d = ch(array[4], array[5], array[6])
+                .wrapping_add(cap_sigma1(array[4]))
+                .wrapping_add(K[i])
+                .wrapping_add(array[7]);
+            array[7] = new_d
+                .wrapping_add(maj(array[0], array[1], array[2]))
+                .wrapping_add(cap_sigma0(array[0]));
+            array[3] = new_d.wrapping_add(array[3]);
+            array.rotate_right(1);
+        }
+        let e = _mm256_setr_epi64x(array[5] as _, array[4] as _, array[1] as _, array[0] as _);
+
+        assert_eq_m256i(r, e);
+    }
+
+    static DATA_32: [u32; 16] = [
+        0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544,
+        0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf,
+        0xfdb97531, 0xeca86420,
+    ];
+
+    #[simd_test(enable = "sm3,avx")]
+    unsafe fn test_mm_sm3msg1_epi32() {
+        fn p1(x: u32) -> u32 {
+            x ^ x.rotate_left(15) ^ x.rotate_left(23)
+        }
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+        let C = &DATA_32[8..12];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+        let c = _mm_loadu_si128(C.as_ptr().cast());
+
+        let r = _mm_sm3msg1_epi32(a, b, c);
+
+        let e = _mm_setr_epi32(
+            p1(A[0] ^ C[0] ^ B[0].rotate_left(15)) as _,
+            p1(A[1] ^ C[1] ^ B[1].rotate_left(15)) as _,
+            p1(A[2] ^ C[2] ^ B[2].rotate_left(15)) as _,
+            p1(A[3] ^ C[3]) as _,
+        );
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm3,avx")]
+    unsafe fn test_mm_sm3msg2_epi32() {
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+        let C = &DATA_32[8..12];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+        let c = _mm_loadu_si128(C.as_ptr().cast());
+
+        let r = _mm_sm3msg2_epi32(a, b, c);
+
+        let e0 = B[0].rotate_left(7) ^ C[0] ^ A[0];
+        let e = _mm_setr_epi32(
+            e0 as _,
+            (B[1].rotate_left(7) ^ C[1] ^ A[1]) as _,
+            (B[2].rotate_left(7) ^ C[2] ^ A[2]) as _,
+            (B[3].rotate_left(7)
+                ^ C[3]
+                ^ A[3]
+                ^ e0.rotate_left(6)
+                ^ e0.rotate_left(15)
+                ^ e0.rotate_left(30)) as _,
+        );
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm3,avx")]
+    unsafe fn test_mm_sm3rnds2_epi32() {
+        fn p0(x: u32) -> u32 {
+            x ^ x.rotate_left(9) ^ x.rotate_left(17)
+        }
+        fn ff(x: u32, y: u32, z: u32, round: u32) -> u32 {
+            if round < 16 {
+                x ^ y ^ z
+            } else {
+                (x & y) | (x & z) | (y & z)
+            }
+        }
+        fn gg(x: u32, y: u32, z: u32, round: u32) -> u32 {
+            if round < 16 {
+                x ^ y ^ z
+            } else {
+                (x & y) | (!x & z)
+            }
+        }
+
+        const ROUND: u32 = 30;
+
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+        let C = &DATA_32[8..12];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+        let c = _mm_loadu_si128(C.as_ptr().cast());
+
+        let r = _mm_sm3rnds2_epi32::<{ ROUND as i32 }>(a, b, c);
+
+        let CONST: u32 = if ROUND < 16 { 0x79cc4519 } else { 0x7a879d8a };
+
+        let mut array = [
+            B[3],
+            B[2],
+            A[3].rotate_left(9),
+            A[2].rotate_left(9),
+            B[1],
+            B[0],
+            A[1].rotate_left(19),
+            A[0].rotate_left(19),
+        ];
+
+        for i in 0..2 {
+            let s1 = array[0]
+                .rotate_left(12)
+                .wrapping_add(array[4])
+                .wrapping_add(CONST.rotate_left(ROUND as u32 + i as u32))
+                .rotate_left(7);
+            let s2 = s1 ^ array[0].rotate_left(12);
+
+            let t1 = ff(array[0], array[1], array[2], ROUND)
+                .wrapping_add(array[3])
+                .wrapping_add(s2)
+                .wrapping_add(C[i] ^ C[i + 2]);
+            let t2 = gg(array[4], array[5], array[6], ROUND)
+                .wrapping_add(array[7])
+                .wrapping_add(s1)
+                .wrapping_add(C[i]);
+
+            array[3] = array[2];
+            array[2] = array[1].rotate_left(9);
+            array[1] = array[0];
+            array[0] = t1;
+            array[7] = array[6];
+            array[6] = array[5].rotate_left(19);
+            array[5] = array[4];
+            array[4] = p0(t2);
+        }
+
+        let e = _mm_setr_epi32(array[5] as _, array[4] as _, array[1] as _, array[0] as _);
+
+        assert_eq_m128i(r, e);
+    }
+
+    fn lower_t(x: u32) -> u32 {
+        static SBOX: [u8; 256] = [
+            0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB,
+            0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26,
+            0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54,
+            0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95,
+            0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73,
+            0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2,
+            0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, 0x1E, 0x24,
+            0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87,
+            0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4,
+            0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE,
+            0xF9, 0x61, 0x15, 0xA1, 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93,
+            0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60,
+            0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD,
+            0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, 0x8D, 0x1B, 0xAF, 0x92,
+            0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1,
+            0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0,
+            0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E,
+            0xC6, 0x84, 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E,
+            0xD7, 0xCB, 0x39, 0x48,
+        ];
+
+        ((SBOX[(x >> 24) as usize] as u32) << 24)
+            | ((SBOX[((x >> 16) & 0xff) as usize] as u32) << 16)
+            | ((SBOX[((x >> 8) & 0xff) as usize] as u32) << 8)
+            | (SBOX[(x & 0xff) as usize] as u32)
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm_sm4key4_epi32() {
+        fn l_key(x: u32) -> u32 {
+            x ^ x.rotate_left(13) ^ x.rotate_left(23)
+        }
+        fn f_key(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+            x0 ^ l_key(lower_t(x1 ^ x2 ^ x3 ^ rk))
+        }
+
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+
+        let r = _mm_sm4key4_epi32(a, b);
+
+        let e0 = f_key(A[0], A[1], A[2], A[3], B[0]);
+        let e1 = f_key(A[1], A[2], A[3], e0, B[1]);
+        let e2 = f_key(A[2], A[3], e0, e1, B[2]);
+        let e3 = f_key(A[3], e0, e1, e2, B[3]);
+        let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _);
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm256_sm4key4_epi32() {
+        let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast());
+        let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast());
+        let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast());
+        let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast());
+
+        let a = _mm256_set_m128i(a_high, a_low);
+        let b = _mm256_set_m128i(b_high, b_low);
+
+        let r = _mm256_sm4key4_epi32(a, b);
+
+        let e_low = _mm_sm4key4_epi32(a_low, b_low);
+        let e_high = _mm_sm4key4_epi32(a_high, b_high);
+        let e = _mm256_set_m128i(e_high, e_low);
+
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm_sm4rnds4_epi32() {
+        fn l_rnd(x: u32) -> u32 {
+            x ^ x.rotate_left(2) ^ x.rotate_left(10) ^ x.rotate_left(18) ^ x.rotate_left(24)
+        }
+        fn f_rnd(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+            x0 ^ l_rnd(lower_t(x1 ^ x2 ^ x3 ^ rk))
+        }
+
+        let A = &DATA_32[0..4];
+        let B = &DATA_32[4..8];
+
+        let a = _mm_loadu_si128(A.as_ptr().cast());
+        let b = _mm_loadu_si128(B.as_ptr().cast());
+
+        let r = _mm_sm4rnds4_epi32(a, b);
+
+        let e0 = f_rnd(A[0], A[1], A[2], A[3], B[0]);
+        let e1 = f_rnd(A[1], A[2], A[3], e0, B[1]);
+        let e2 = f_rnd(A[2], A[3], e0, e1, B[2]);
+        let e3 = f_rnd(A[3], e0, e1, e2, B[3]);
+        let e = _mm_setr_epi32(e0 as _, e1 as _, e2 as _, e3 as _);
+
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "sm4,avx")]
+    unsafe fn test_mm256_sm4rnds4_epi32() {
+        let a_low = _mm_loadu_si128(DATA_32.as_ptr().cast());
+        let a_high = _mm_loadu_si128(DATA_32[4..].as_ptr().cast());
+        let b_low = _mm_loadu_si128(DATA_32[8..].as_ptr().cast());
+        let b_high = _mm_loadu_si128(DATA_32[12..].as_ptr().cast());
+
+        let a = _mm256_set_m128i(a_high, a_low);
+        let b = _mm256_set_m128i(b_high, b_low);
+
+        let r = _mm256_sm4rnds4_epi32(a, b);
+
+        let e_low = _mm_sm4rnds4_epi32(a_low, b_low);
+        let e_high = _mm_sm4rnds4_epi32(a_high, b_high);
+        let e = _mm256_set_m128i(e_high, e_low);
+
+        assert_eq_m256i(r, e);
+    }
 }