Skip to content

Commit 1a64574

Browse files
tpaint authored and gopherbot committed
crypto/sha256: add sha-ni implementation
goos: linux
goarch: amd64
pkg: crypto/sha256
cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │    sec/op     vs base               │
Hash8Bytes/New-4      169.20n ± 7%   65.40n ± 5%   -61.35% (p=0.000 n=10)
Hash8Bytes/Sum224-4   166.10n ± 3%   65.20n ± 8%   -60.74% (p=0.000 n=10)
Hash8Bytes/Sum256-4   168.50n ± 6%   63.58n ± 7%   -62.27% (p=0.000 n=10)
Hash1K/New-4          2275.5n ± 5%   618.5n ± 2%   -72.82% (p=0.000 n=10)
Hash1K/Sum224-4       2364.5n ± 1%   618.1n ± 1%   -73.86% (p=0.000 n=10)
Hash1K/Sum256-4       2338.5n ± 2%   613.0n ± 2%   -73.79% (p=0.000 n=10)
Hash8K/New-4          17.530µ ± 2%   4.501µ ± 1%   -74.33% (p=0.000 n=10)
Hash8K/Sum224-4       17.456µ ± 2%   4.505µ ± 1%   -74.19% (p=0.000 n=10)
Hash8K/Sum256-4       17.417µ ± 2%   4.504µ ± 1%   -74.14% (p=0.000 n=10)
geomean               1.897µ         564.3n        -70.25%

                    │  bench.old   │               bench.new                │
                    │     B/s      │      B/s       vs base                 │
Hash8Bytes/New-4      45.11Mi ± 6%   116.66Mi ± 5%  +158.62% (p=0.000 n=10)
Hash8Bytes/Sum224-4   45.92Mi ± 3%   117.04Mi ± 8%  +154.89% (p=0.000 n=10)
Hash8Bytes/Sum256-4   45.29Mi ± 6%   120.00Mi ± 7%  +164.99% (p=0.000 n=10)
Hash1K/New-4          429.2Mi ± 5%   1578.9Mi ± 2%  +267.92% (p=0.000 n=10)
Hash1K/Sum224-4       413.0Mi ± 1%   1579.8Mi ± 1%  +282.49% (p=0.000 n=10)
Hash1K/Sum256-4       417.6Mi ± 1%   1593.1Mi ± 2%  +281.53% (p=0.000 n=10)
Hash8K/New-4          445.7Mi ± 1%   1735.9Mi ± 1%  +289.50% (p=0.000 n=10)
Hash8K/Sum224-4       447.6Mi ± 2%   1734.5Mi ± 1%  +287.54% (p=0.000 n=10)
Hash8K/Sum256-4       448.6Mi ± 2%   1734.8Mi ± 1%  +286.75% (p=0.000 n=10)
geomean               204.3Mi        686.8Mi        +236.11%

                    │  bench.old  │             bench.new              │
                    │    B/op     │    B/op      vs base               │
Hash8Bytes/New-4      0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4   0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4   0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash1K/New-4          0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8K/New-4          0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
geomean                         ²                +0.00%             ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                    │  bench.old  │             bench.new              │
                    │  allocs/op  │  allocs/op   vs base               │
Hash8Bytes/New-4      0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4   0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4   0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash1K/New-4          0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8K/New-4          0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4       0.000 ± 0%    0.000 ± 0%   ~ (p=1.000 n=10) ¹
geomean                         ²                +0.00%             ²
¹ all samples are equal
² summaries must be >0 to compute geomean

Fixes #50543.

Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0
Reviewed-on: https://go-review.googlesource.com/c/go/+/408795
Reviewed-by: Paulo Gomes <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Run-TryBot: Russ Cox <[email protected]>
Reviewed-by: Alan Donovan <[email protected]>
Auto-Submit: Russ Cox <[email protected]>
Reviewed-by: Russ Cox <[email protected]>
1 parent e29dd78 commit 1a64574

File tree

2 files changed

+152
-9
lines changed

2 files changed

+152
-9
lines changed

src/crypto/sha256/sha256block_amd64.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ package sha256
77
import "internal/cpu"
88

99
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
10+
var useSHA = useAVX2 && cpu.X86.HasSHA

src/crypto/sha256/sha256block_amd64.s

Lines changed: 151 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@
179179

180180
#define XFER Y9
181181

182-
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
182+
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
183183
#define X_BYTE_FLIP_MASK X13
184184

185185
#define NUM_BYTES DX
@@ -232,14 +232,14 @@
232232
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
233233
; \
234234
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
235-
XORL g, y2; \ // y2 = f^g // CH
235+
XORL g, y2; \ // y2 = f^g // CH
236236
VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
237237
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
238238
; \
239239
ANDL e, y2; \ // y2 = (f^g)&e // CH
240240
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
241241
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
242-
ADDL h, d; \ // d = k + w + h + d // --
242+
ADDL h, d; \ // d = k + w + h + d // --
243243
; \
244244
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
245245
VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
@@ -270,7 +270,7 @@
270270
MOVL a, y3; \ // y3 = a // MAJA
271271
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
272272
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
273-
ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
273+
ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
274274
ORL c, y3; \ // y3 = a|c // MAJA
275275
; \
276276
VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
@@ -316,7 +316,7 @@
316316
; \
317317
MOVL a, y3; \ // y3 = a // MAJA
318318
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
319-
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
319+
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
320320
; \
321321
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
322322
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
@@ -495,7 +495,7 @@
495495
; \
496496
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
497497
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
498-
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
498+
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
499499
ORL c, y3; \ // y3 = a|c // MAJA
500500
; \
501501
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
@@ -531,7 +531,7 @@
531531
; \
532532
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
533533
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
534-
ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
534+
ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
535535
ORL c, y3; \ // y3 = a|c // MAJA
536536
; \
537537
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
@@ -550,9 +550,80 @@
550550
; \
551551
ADDL y3, h // h = t1 + S0 + MAJ // --
552552

553+
// Definitions for sha-ni version
554+
//
555+
// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
556+
// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
557+
//
558+
// Reference
559+
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
560+
// Algorithm on Intel® Architecture Processors", July 2013
561+
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
562+
//
563+
564+
#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
565+
#define dataPtr SI // input, base pointer to first input data block
566+
#define numBytes DX // input, number of input bytes to be processed
567+
#define sha256Constants AX // round constants from K256 table, indexed by round number x 32
568+
#define msg X0 // input data
569+
#define state0 X1 // round intermediates and outputs
570+
#define state1 X2
571+
#define m0 X3 // m0, m1,... m4 -- round message temps
572+
#define m1 X4
573+
#define m2 X5
574+
#define m3 X6
575+
#define m4 X7
576+
#define shufMask X8 // input data endian conversion control mask
577+
#define abefSave X9 // digest hash vector inter-block buffer abef
578+
#define cdghSave X10 // digest hash vector inter-block buffer cdgh
579+
580+
#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds
581+
582+
#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
583+
SHA256MSG1 m, a
584+
585+
#define vmov(a,b) \ // msg copy for all but rounds 12-15
586+
VMOVDQA a, b
587+
588+
#define vmovrev(a,b) \ // reverse copy for rounds 12-15
589+
VMOVDQA b, a
590+
591+
// sha rounds 0 to 11
592+
// identical with the exception of the final msg op
593+
// which is replaced with a nop for rounds where it is not needed
594+
// refer to Gulley, et al for more information
595+
#define rounds0to11(m,a,c,sha256Msg1) \
596+
VMOVDQU c*16(dataPtr), msg \
597+
PSHUFB shufMask, msg \
598+
VMOVDQA msg, m \
599+
PADDD (c*32)(sha256Constants), msg \
600+
SHA256RNDS2 msg, state0, state1 \
601+
PSHUFD $0x0e, msg, msg \
602+
SHA256RNDS2 msg, state1, state0 \
603+
sha256Msg1 (m,a)
604+
605+
// sha rounds 12 to 59
606+
// identical with the exception of the final msg op
607+
// and the reverse copy(m,msg) in round 12 which is required
608+
// after the last data load
609+
// refer to Gulley, et al for more information
610+
#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
611+
movop (m,msg) \
612+
PADDD (c*32)(sha256Constants), msg \
613+
SHA256RNDS2 msg, state0, state1 \
614+
VMOVDQA m, m4 \
615+
PALIGNR $4, a, m4 \
616+
PADDD m4, t \
617+
SHA256MSG2 m, t \
618+
PSHUFD $0x0e, msg, msg \
619+
SHA256RNDS2 msg, state1, state0 \
620+
sha256Msg1 (m,a)
621+
553622
TEXT ·block(SB), 0, $536-32
554-
CMPB ·useAVX2(SB), $1
555-
JE avx2
623+
CMPB ·useSHA(SB), $1
624+
JE sha_ni
625+
CMPB ·useAVX2(SB), $1
626+
JE avx2
556627

557628
MOVQ p_base+8(FP), SI
558629
MOVQ p_len+16(FP), DX
@@ -862,6 +933,77 @@ done_hash:
862933
VZEROUPPER
863934
RET
864935

936+
sha_ni:
937+
MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
938+
MOVQ p_base+8(FP), dataPtr // init input data base pointer
939+
MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
940+
SHRQ $6, numBytes // round input length down to a multiple of 64 bytes (shift right, then left, by 6)
941+
SHLQ $6, numBytes
942+
CMPQ numBytes, $0 // exit early for zero-length input buffer
943+
JEQ done
944+
ADDQ dataPtr, numBytes // point numBytes to end of input buffer
945+
VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
946+
VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
947+
PSHUFD $0xb1, state0, state0 // CDAB
948+
PSHUFD $0x1b, state1, state1 // EFGH
949+
VMOVDQA state0, m4
950+
PALIGNR $8, state1, state0 // ABEF
951+
PBLENDW $0xf0, m4, state1 // CDGH
952+
VMOVDQA flip_mask<>(SB), shufMask
953+
LEAQ K256<>(SB), sha256Constants
954+
955+
roundLoop:
956+
// save hash values for addition after rounds
957+
VMOVDQA state0, abefSave
958+
VMOVDQA state1, cdghSave
959+
960+
// do rounds 0-59
961+
rounds0to11 (m0,-,0,nop) // 0-3
962+
rounds0to11 (m1,m0,1,sha256msg1) // 4-7
963+
rounds0to11 (m2,m1,2,sha256msg1) // 8-11
964+
VMOVDQU (3*16)(dataPtr), msg
965+
PSHUFB shufMask, msg
966+
rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
967+
rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
968+
rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
969+
rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
970+
rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
971+
rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
972+
rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
973+
rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
974+
rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
975+
rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
976+
rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
977+
rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59
978+
979+
// do rounds 60-63
980+
VMOVDQA m3, msg
981+
PADDD (15*32)(sha256Constants), msg
982+
SHA256RNDS2 msg, state0, state1
983+
PSHUFD $0x0e, msg, msg
984+
SHA256RNDS2 msg, state1, state0
985+
986+
// add current hash values with previously saved
987+
PADDD abefSave, state0
988+
PADDD cdghSave, state1
989+
990+
// advance data pointer; loop until buffer empty
991+
ADDQ $64, dataPtr
992+
CMPQ numBytes, dataPtr
993+
JNE roundLoop
994+
995+
// write hash values back in the correct order
996+
PSHUFD $0x1b, state0, state0 // FEBA
997+
PSHUFD $0xb1, state1, state1 // DCHG
998+
VMOVDQA state0, m4
999+
PBLENDW $0xf0, state1, state0 // DCBA
1000+
PALIGNR $8, m4, state1 // HGFE
1001+
VMOVDQU state0, (0*16)(digestPtr)
1002+
VMOVDQU state1, (1*16)(digestPtr)
1003+
1004+
done:
1005+
RET
1006+
8651007
// shuffle byte order from LE to BE
8661008
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
8671009
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b

0 commit comments

Comments
 (0)