Skip to content

Commit 080b167

Browse files
committed
sha3: Add SIMD implementation on ARMv8
On ARMv8, four SIMD instructions (EOR3, RAX1, XAR, and BCAX) were added to accelerate SHA-3 operations. This change adds a SIMD implementation of SHA-3 for ARMv8 that uses them. Compared to the pure Go implementation (the one in keccakf.go), the timings are listed in the following benchmarks:

benchmark                         old ns/op        new ns/op        delta
BenchmarkPermutationFunction-8    227.0 ns/op      153.6 ns/op      -32.33%
BenchmarkSha3_512_MTU-8           4954 ns/op       3296 ns/op       -33.47%
BenchmarkSha3_384_MTU-8           3586 ns/op       2441 ns/op       -31.93%
BenchmarkSha3_256_MTU-8           2909 ns/op       1982 ns/op       -31.87%
BenchmarkSha3_224_MTU-8           2779 ns/op       1905 ns/op       -31.45%
BenchmarkShake128_MTU-8           2326 ns/op       1588 ns/op       -31.73%
BenchmarkShake256_MTU-8           2485 ns/op       1670 ns/op       -32.80%
BenchmarkShake256_16x-8           37052 ns/op      26715 ns/op      -27.90%
BenchmarkShake256_1MiB-8          1911863 ns/op    1293014 ns/op    -32.37%
BenchmarkSha3_512_1MiB-8          3496335 ns/op    2317853 ns/op    -33.71%

benchmark                         old MB/s         new MB/s         speedup
BenchmarkPermutationFunction-8    881.22 MB/s      1302.48 MB/s     1.48x
BenchmarkSha3_512_MTU-8           272.50 MB/s      409.64 MB/s      1.50x
BenchmarkSha3_384_MTU-8           376.47 MB/s      553.06 MB/s      1.47x
BenchmarkSha3_256_MTU-8           464.11 MB/s      681.27 MB/s      1.47x
BenchmarkSha3_224_MTU-8           485.75 MB/s      708.83 MB/s      1.46x
BenchmarkShake128_MTU-8           580.32 MB/s      849.97 MB/s      1.46x
BenchmarkShake256_MTU-8           543.34 MB/s      808.53 MB/s      1.49x
BenchmarkShake256_16x-8           442.19 MB/s      613.29 MB/s      1.39x
BenchmarkShake256_1MiB-8          548.46 MB/s      810.95 MB/s      1.48x
BenchmarkSha3_512_1MiB-8          299.91 MB/s      452.39 MB/s      1.51x
1 parent 4f45737 commit 080b167

File tree

4 files changed

+194
-4
lines changed

4 files changed

+194
-4
lines changed

sha3/keccakf.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 || purego || !gc
6-
// +build !amd64 purego !gc
5+
//go:build !amd64 || purego || !gc || !arm64
6+
// +build !amd64 purego !gc !arm64
77

88
package sha3
99

@@ -35,9 +35,9 @@ var rc = [24]uint64{
3535
0x8000000080008008,
3636
}
3737

38-
// keccakF1600 applies the Keccak permutation to a 1600b-wide
38+
// keccakF1600Generic applies the Keccak permutation to a 1600b-wide
3939
// state represented as an array of 25 uint64s.
40-
func keccakF1600(a *[25]uint64) {
40+
func keccakF1600Generic(a *[25]uint64) {
4141
// Implementation translated from Keccak-inplace.c
4242
// in the keccak reference code.
4343
var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64

sha3/keccakf_arm64.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build arm64
6+
// +build arm64
7+
8+
package sha3
9+
10+
// This function is implemented in keccakf_arm64.s.
11+
// For ARMv8 machines GOARM=n/a, and GOARCH=arm64
12+
// see https://github.com/golang/go/wiki/GoArm
13+
//go:noescape
14+
func keccakF1600(a *[25]uint64)

sha3/keccakf_arm64.s

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build arm64 && !purego && gc
// +build arm64,!purego,gc

#include "textflag.h"

// func keccakF1600(a *[25]uint64)
//
// keccakF1600 loads the 25 64-bit state words from a into the low 64-bit
// lanes of V0..V24, runs 24 rounds using the EOR3, RAX1, XAR and BCAX
// instructions, and stores the result back to a.
//
// Register use:
//   R0       pointer to the state (restored to its base after the loads)
//   R1       pointer to the next round constant in round_consts<>
//   R2       remaining round count
//   V0-V24   state words a[0]..a[24]
//   V25-V31  scratch (column parities, theta deltas, rotated lanes)
//
// The function is a leaf with no stack frame, and its only argument is one
// 8-byte pointer, hence NOSPLIT and $0-8.
//
// NOTE(review): no CPU feature check guards these instructions in this
// file; confirm that callers only reach this code on cores that implement
// them.
TEXT ·keccakF1600(SB), NOSPLIT, $0-8
	MOVD	a+0(FP), R0
	MOVD	$round_consts<>(SB), R1
	MOVD	$24, R2 // counter for loop

	// Load a[0]..a[24]; each post-indexed load advances R0 by 16 bytes.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Undo the 12 post-increments (12 * 16 = 192) so R0 points at a[0]
	// again for the final stores.
	SUB	$192, R0, R0

loop:
	// theta: column parities into V25..V29
	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16

	// theta: combine each parity with a neighbour rotated left by one
	VRAX1	V27.D2, V25.D2, V30.D2
	VRAX1	V28.D2, V26.D2, V31.D2
	VRAX1	V29.D2, V27.D2, V27.D2
	VRAX1	V25.D2, V28.D2, V28.D2
	VRAX1	V26.D2, V29.D2, V29.D2

	// theta + rho + Pi
	VXAR	$64-1, V30.D2, V1.D2, V25.D2

	VXAR	$64-44, V30.D2, V6.D2, V1.D2
	VXAR	$64-20, V28.D2, V9.D2, V6.D2
	VXAR	$64-61, V31.D2, V22.D2, V9.D2
	VXAR	$64-39, V28.D2, V14.D2, V22.D2
	VXAR	$64-18, V29.D2, V20.D2, V14.D2

	VXAR	$64-62, V31.D2, V2.D2, V26.D2

	VXAR	$64-43, V31.D2, V12.D2, V2.D2
	VXAR	$64-25, V27.D2, V13.D2, V12.D2
	VXAR	$64-8, V28.D2, V19.D2, V13.D2
	VXAR	$64-56, V27.D2, V23.D2, V19.D2
	VXAR	$64-41, V29.D2, V15.D2, V23.D2

	VXAR	$64-27, V28.D2, V4.D2, V15.D2

	VXAR	$64-14, V28.D2, V24.D2, V28.D2
	VXAR	$64-2, V30.D2, V21.D2, V24.D2
	VXAR	$64-55, V27.D2, V8.D2, V8.D2
	VXAR	$64-45, V30.D2, V16.D2, V4.D2
	VXAR	$64-36, V29.D2, V5.D2, V16.D2

	VXAR	$64-28, V27.D2, V3.D2, V5.D2

	VEOR	V29.B16, V0.B16, V0.B16

	VXAR	$64-21, V27.D2, V18.D2, V27.D2
	VXAR	$64-15, V31.D2, V17.D2, V3.D2
	VXAR	$64-10, V30.D2, V11.D2, V30.D2
	VXAR	$64-6, V31.D2, V7.D2, V31.D2
	VXAR	$64-3, V29.D2, V10.D2, V29.D2

	// chi + iota
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free from here on; load this round's constant into it and
	// advance R1 to the next one.
	VLD1R.P	8(R1), [V26.D2]

	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // iota (for chi part)
	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	VEOR	V26.B16, V0.B16, V0.B16 // iota

	SUBS	$1, R2, R2
	BNE	loop

	// Store the permuted state back to a[0]..a[24].
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)

	RET

// round_consts<> holds the 24 round constants (24 * 8 = 192 bytes), one per
// loop iteration, read sequentially through R1. The symbol is file-local
// (<>) so it cannot clash with a same-named symbol in another package.
DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001
DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082
DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a
DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000
DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b
DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001
DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081
DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009
DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a
DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088
DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009
DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a
DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b
DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b
DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089
DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003
DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002
DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080
DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a
DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a
DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081
DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080
DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001
DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008
GLOBL	round_consts<>(SB), NOPTR|RODATA, $192

sha3/keccakf_noasm.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego || !arm64
6+
// +build !arm64,!s390x,!ppc64le !gc purego !arm64
7+
8+
package sha3
9+
10+
func keccakF1600(a *[25]uint64) {
11+
keccakF1600Generic(a)
12+
}

0 commit comments

Comments
 (0)