@@ -179,7 +179,7 @@
 
 #define XFER Y9
 
-#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
+#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
 #define X_BYTE_FLIP_MASK X13
 
 #define NUM_BYTES DX
@@ -232,14 +232,14 @@
 	RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
 	; \
 	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
-	XORL g, y2; \ // y2 = f^g // CH
+	XORL g, y2; \ // y2 = f^g // CH
 	VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1
 	RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
 	; \
 	ANDL e, y2; \ // y2 = (f^g)&e // CH
 	XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
 	RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
-	ADDL h, d; \ // d = k + w + h + d // --
+	ADDL h, d; \ // d = k + w + h + d // --
 	; \
 	ANDL b, y3; \ // y3 = (a|c)&b // MAJA
 	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
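Note: the S0/S1/CH/MAJ tags in the round comments above (and in the hunks that follow) name the standard FIPS 180-4 round primitives that the RORXL/XORL/ANDL sequences compute. As a reference sketch only -- not part of this commit, and with function names of my choosing -- the same primitives in plain Go:

package sha256sketch

import "math/bits"

// bigSigma0 is the "S0" tag: (a ror 2) ^ (a ror 13) ^ (a ror 22);
// S0A/S0B in the comments mark the individual rotations.
func bigSigma0(a uint32) uint32 {
	return bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
}

// bigSigma1 is the "S1" tag: (e ror 6) ^ (e ror 11) ^ (e ror 25);
// S1A/S1B likewise mark the individual rotations.
func bigSigma1(e uint32) uint32 {
	return bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
}

// ch is the "CH" tag, using the same ((f^g)&e)^g identity as the assembly.
func ch(e, f, g uint32) uint32 {
	return ((f ^ g) & e) ^ g
}

// maj is the "MAJ"/"MAJA" tag, using the ((a|c)&b)|(a&c) identity.
func maj(a, b, c uint32) uint32 {
	return ((a | c) & b) | (a & c)
}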
@@ -270,7 +270,7 @@
 	MOVL a, y3; \ // y3 = a // MAJA
 	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
 	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
-	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+	ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
 	ORL c, y3; \ // y3 = a|c // MAJA
 	; \
 	VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
@@ -316,7 +316,7 @@
 	; \
 	MOVL a, y3; \ // y3 = a // MAJA
 	RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
-	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
 	; \
 	VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
 	RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
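Note: the vector annotations W[-7] + W[-16], W[-15] >> 3, and W[-2] ror 19 above implement the SHA-256 message-schedule recurrence, four words at a time. A scalar Go sketch of the recurrence the XTMP registers compute (reference only; names are mine):

package sha256sketch

import "math/bits"

// sigma0 operates on W[-15]: (x ror 7) ^ (x ror 18) ^ (x >> 3).
func sigma0(x uint32) uint32 {
	return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
}

// sigma1 operates on W[-2]: (x ror 17) ^ (x ror 19) ^ (x >> 10).
func sigma1(x uint32) uint32 {
	return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
}

// expand fills w[16:64] from the 16 loaded message words -- the same
// recurrence the VPALIGNR/VPSRLD/VPSRLQ sequences evaluate in 4-lane chunks.
func expand(w *[64]uint32) {
	for t := 16; t < 64; t++ {
		w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
	}
}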
@@ -495,7 +495,7 @@
 	; \
 	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
 	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
-	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+	ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
 	ORL c, y3; \ // y3 = a|c // MAJA
 	; \
 	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
@@ -531,7 +531,7 @@
 	; \
 	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
 	RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
-	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
+	ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
 	ORL c, y3; \ // y3 = a|c // MAJA
 	; \
 	XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
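Note: read together, the round comments trace one complete SHA-256 round: h first accumulates k + w, d picks up the partial T1 early ("d = k + w + h + d"), and h finishes with "h = t1 + S0 + MAJ". A scalar sketch of the full update, reusing bigSigma0/bigSigma1/ch/maj from the earlier sketch (illustrative only, not the commit's code):

// round performs one SHA-256 round on working variables s = [a b c d e f g h];
// kw is K[t]+W[t], the value the ADDL instructions pull from the stack.
func round(s [8]uint32, kw uint32) [8]uint32 {
	a, b, c, d, e, f, g, h := s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]
	t1 := h + bigSigma1(e) + ch(e, f, g) + kw // "h = k + w + h", then S1 and CH folded in
	t2 := bigSigma0(a) + maj(a, b, c)         // S0 and MAJ
	return [8]uint32{t1 + t2, a, b, c, d + t1, e, f, g} // "d = k+w+h+d"; "h = t1 + S0 + MAJ"
}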
@@ -550,9 +550,80 @@
 	; \
 	ADDL y3, h // h = t1 + S0 + MAJ // --
 
+// Definitions for sha-ni version
+//
+// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
+// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+//
+
+#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
+#define dataPtr SI // input, base pointer to first input data block
+#define numBytes DX // input, number of input bytes to be processed
+#define sha256Constants AX // round constants from K256 table, indexed by round number x 32
+#define msg X0 // input data
+#define state0 X1 // round intermediates and outputs
+#define state1 X2
+#define m0 X3 // m0, m1,... m4 -- round message temps
+#define m1 X4
+#define m2 X5
+#define m3 X6
+#define m4 X7
+#define shufMask X8 // input data endian conversion control mask
+#define abefSave X9 // digest hash vector inter-block buffer abef
+#define cdghSave X10 // digest hash vector inter-block buffer cdgh
+
+#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds
+
+#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
+	SHA256MSG1 m, a
+
+#define vmov(a,b) \ // msg copy for all but rounds 12-15
+	VMOVDQA a, b
+
+#define vmovrev(a,b) \ // reverse copy for rounds 12-15
+	VMOVDQA b, a
+
+// sha rounds 0 to 11
+// identical with the exception of the final msg op
+// which is replaced with a nop for rounds where it is not needed
+// refer to Gulley, et al for more information
+#define rounds0to11(m,a,c,sha256Msg1) \
+	VMOVDQU c*16(dataPtr), msg \
+	PSHUFB shufMask, msg \
+	VMOVDQA msg, m \
+	PADDD (c*32)(sha256Constants), msg \
+	SHA256RNDS2 msg, state0, state1 \
+	PSHUFD $0x0e, msg, msg \
+	SHA256RNDS2 msg, state1, state0 \
+	sha256Msg1 (m,a)
+
+// sha rounds 12 to 59
+// identical with the exception of the final msg op
+// and the reverse copy(m,msg) in round 12 which is required
+// after the last data load
+// refer to Gulley, et al for more information
+#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
+	movop (m,msg) \
+	PADDD (c*32)(sha256Constants), msg \
+	SHA256RNDS2 msg, state0, state1 \
+	VMOVDQA m, m4 \
+	PALIGNR $4, a, m4 \
+	PADDD m4, t \
+	SHA256MSG2 m, t \
+	PSHUFD $0x0e, msg, msg \
+	SHA256RNDS2 msg, state1, state0 \
+	sha256Msg1 (m,a)
+
 TEXT ·block(SB), 0, $536-32
-	CMPB ·useAVX2(SB), $1
-	JE avx2
+	CMPB ·useSHA(SB), $1
+	JE sha_ni
+	CMPB ·useAVX2(SB), $1
+	JE avx2
 
 	MOVQ p_base+8(FP), SI
 	MOVQ p_len+16(FP), DX
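Note: SHA256RNDS2 performs two rounds per issue, while SHA256MSG1 and SHA256MSG2 split the message-schedule recurrence between them (see Gulley et al., referenced above). A scalar Go model of the two MSG instructions as I read Intel's documentation -- lane bookkeeping and names are mine, sigma0/sigma1 as in the earlier schedule sketch:

// sha256msg1 models SHA256MSG1 dst, src: adds sigma0 of the following word
// to each of the four schedule words in dst (w4 comes from src's low lane).
func sha256msg1(dst *[4]uint32, w4 uint32) {
	w := *dst
	dst[0] = w[0] + sigma0(w[1])
	dst[1] = w[1] + sigma0(w[2])
	dst[2] = w[2] + sigma0(w[3])
	dst[3] = w[3] + sigma0(w4)
}

// sha256msg2 models SHA256MSG2 dst, src: completes W[16..19] by adding
// sigma1 of W[14] and W[15] (src's upper lanes), with a serial dependency
// carrying the fresh W[16], W[17] into the upper two lanes.
func sha256msg2(dst *[4]uint32, w14, w15 uint32) {
	dst[0] += sigma1(w14)
	dst[1] += sigma1(w15)
	dst[2] += sigma1(dst[0])
	dst[3] += sigma1(dst[1])
}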
@@ -862,6 +933,77 @@ done_hash:
 	VZEROUPPER
 	RET
 
+sha_ni:
+	MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
+	MOVQ p_base+8(FP), dataPtr // init input data base pointer
+	MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
+	SHRQ $6, numBytes // force modulo 64 input buffer length
+	SHLQ $6, numBytes
+	CMPQ numBytes, $0 // exit early for zero-length input buffer
+	JEQ done
+	ADDQ dataPtr, numBytes // point numBytes to end of input buffer
+	VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
+	VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
+	PSHUFD $0xb1, state0, state0 // CDAB
+	PSHUFD $0x1b, state1, state1 // EFGH
+	VMOVDQA state0, m4
+	PALIGNR $8, state1, state0 // ABEF
+	PBLENDW $0xf0, m4, state1 // CDGH
+	VMOVDQA flip_mask<>(SB), shufMask
+	LEAQ K256<>(SB), sha256Constants
+
+roundLoop:
+	// save hash values for addition after rounds
+	VMOVDQA state0, abefSave
+	VMOVDQA state1, cdghSave
+
+	// do rounds 0-59
+	rounds0to11 (m0,-,0,nop) // 0-3
+	rounds0to11 (m1,m0,1,sha256msg1) // 4-7
+	rounds0to11 (m2,m1,2,sha256msg1) // 8-11
+	VMOVDQU (3*16)(dataPtr), msg
+	PSHUFB shufMask, msg
+	rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
+	rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
+	rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
+	rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
+	rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
+	rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
+	rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
+	rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
+	rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
+	rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
+	rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
+	rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59
+
+	// do rounds 60-63
+	VMOVDQA m3, msg
+	PADDD (15*32)(sha256Constants), msg
+	SHA256RNDS2 msg, state0, state1
+	PSHUFD $0x0e, msg, msg
+	SHA256RNDS2 msg, state1, state0
+
+	// add current hash values with previously saved
+	PADDD abefSave, state0
+	PADDD cdghSave, state1
+
+	// advance data pointer; loop until buffer empty
+	ADDQ $64, dataPtr
+	CMPQ numBytes, dataPtr
+	JNE roundLoop
+
+	// write hash values back in the correct order
+	PSHUFD $0x1b, state0, state0 // FEBA
+	PSHUFD $0xb1, state1, state1 // DCHG
+	VMOVDQA state0, m4
+	PBLENDW $0xf0, state1, state0 // DCBA
+	PALIGNR $8, m4, state1 // HGFE
+	VMOVDQU state0, (0*16)(digestPtr)
+	VMOVDQU state1, (1*16)(digestPtr)
+
+done:
+	RET
+
 // shuffle byte order from LE to BE
 DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
 DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
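Note: flip_mask, applied with PSHUFB, byte-reverses each 32-bit lane -- its bytes read 03 02 01 00, 07 06 05 04, ... in memory, so each destination byte selects its dword's bytes in reverse -- turning little-endian loads into the big-endian words SHA-256 expects. The scalar equivalent in Go (sketch only):

package sha256sketch

import "math/bits"

// bswapLanes mirrors PSHUFB with flip_mask: byte-swap every 32-bit lane.
func bswapLanes(v [4]uint32) [4]uint32 {
	for i, x := range v {
		v[i] = bits.ReverseBytes32(x)
	}
	return v
}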