Skip to content

Commit 3b1cf3e

Browse files
committed
sha3: Add SIMD implementation on ARMv8
ARMv8 adds four SIMD instructions (EOR3, RAX1, XAR, and BCAX) to accelerate SHA-3 operations. This commit adds a SIMD implementation of sha3 for ARMv8 that uses them.
1 parent 4f45737 commit 3b1cf3e

File tree

4 files changed

+204
-2
lines changed

4 files changed

+204
-2
lines changed

sha3/keccakf.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ var rc = [24]uint64{
3535
0x8000000080008008,
3636
}
3737

38-
// keccakF1600 applies the Keccak permutation to a 1600b-wide
38+
// keccakF1600Generic applies the Keccak permutation to a 1600b-wide
3939
// state represented as a slice of 25 uint64s.
40-
func keccakF1600(a *[25]uint64) {
40+
func keccakF1600Generic(a *[25]uint64) {
4141
// Implementation translated from Keccak-inplace.c
4242
// in the keccak reference code.
4343
var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64

sha3/keccakf_arm64.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build arm
6+
// +build arm
7+
8+
package sha3
9+
10+
import _ "unsafe"
11+
12+
// goarm is bound to the runtime's goarm variable (runtime.goarm) through
// go:linkname; keccakF1600 reads it to decide which implementation to call.
13+
//go:linkname goarm runtime.goarm
14+
var goarm uint8
15+
16+
// keccakF1600armv8 has no Go body; it is implemented in keccakf_arm64.s.
// It takes a pointer to the 25-lane state.
//go:noescape
17+
func keccakF1600armv8(a *[25]uint64)
18+
19+
func keccakF1600(a *[25]uint64) {
20+
if goarm >= 8 {
21+
keccakF1600armv8(a)
22+
} else {
23+
keccakF1600Generic(a)
24+
}
25+
}

sha3/keccakf_arm64.s

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build arm
6+
// +build arm
7+
8+
#include "textflag.h"
9+
10+
// func keccakF1600armv8(a *[25]uint64)
//
// Runs 24 rounds of the Keccak permutation over the 25 64-bit lanes at a,
// using the EOR3, RAX1, XAR and BCAX vector instructions, and stores the
// result back to a.
//
// Register use:
//   R0       pointer to the state
//   R1       pointer to the next round constant in round_consts
//   R2       remaining round count
//   V0-V24   state lanes a[0]..a[24], one lane per register
//   V25-V31  temporaries (V26 also carries the round constant)
//
// The frame is $0-8: no locals, and a single 8-byte pointer argument with
// no results, matching the Go prototype above.
TEXT ·keccakF1600armv8(SB),$0-8
	MOVD	a+0(FP), R0
	MOVD	$round_consts(SB), R1
	MOVD	$24, R2 // counter for loop

	// Load the state, two lanes per instruction, post-incrementing R0.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Rewind R0 to the start of the state: 12 post-incremented loads of
	// 16 bytes each advanced it by 192 (the final load does not advance).
	SUB	$192, R0, R0

loop:
	// theta
	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16

	VRAX1	V27.D2, V25.D2, V30.D2
	VRAX1	V28.D2, V26.D2, V31.D2
	VRAX1	V29.D2, V27.D2, V27.D2
	VRAX1	V25.D2, V28.D2, V28.D2
	VRAX1	V26.D2, V29.D2, V29.D2

	// theta + rho + Pi
	// The immediates are written as 64-n; each source lane is read before
	// the register holding it is overwritten, so the order below matters.
	VXAR	$64-1, V30.D2, V1.D2, V25.D2

	VXAR	$64-44, V30.D2, V6.D2, V1.D2
	VXAR	$64-20, V28.D2, V9.D2, V6.D2
	VXAR	$64-61, V31.D2, V22.D2, V9.D2
	VXAR	$64-39, V28.D2, V14.D2, V22.D2
	VXAR	$64-18, V29.D2, V20.D2, V14.D2

	VXAR	$64-62, V31.D2, V2.D2, V26.D2

	VXAR	$64-43, V31.D2, V12.D2, V2.D2
	VXAR	$64-25, V27.D2, V13.D2, V12.D2
	VXAR	$64-8, V28.D2, V19.D2, V13.D2
	VXAR	$64-56, V27.D2, V23.D2, V19.D2
	VXAR	$64-41, V29.D2, V15.D2, V23.D2

	VXAR	$64-27, V28.D2, V4.D2, V15.D2

	VXAR	$64-14, V28.D2, V24.D2, V28.D2
	VXAR	$64-2, V30.D2, V21.D2, V24.D2
	VXAR	$64-55, V27.D2, V8.D2, V8.D2
	VXAR	$64-45, V30.D2, V16.D2, V4.D2
	VXAR	$64-36, V29.D2, V5.D2, V16.D2

	VXAR	$64-28, V27.D2, V3.D2, V5.D2

	VEOR	V29.B16, V0.B16, V0.B16

	VXAR	$64-21, V27.D2, V18.D2, V27.D2
	VXAR	$64-15, V31.D2, V17.D2, V3.D2
	VXAR	$64-10, V30.D2, V11.D2, V30.D2
	VXAR	$64-6, V31.D2, V7.D2, V31.D2
	VXAR	$64-3, V29.D2, V10.D2, V29.D2

	// chi + iota
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free from here on: load this round's constant into it and
	// advance R1 to the next one.
	VLD1R.P	8(R1), [V26.D2]

	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // iota (for chi part)
	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	VEOR	V26.B16, V0.B16, V0.B16 // iota

	SUBS	$1, R2, R2
	BNE	loop

	// Store the state back, mirroring the load sequence above.
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)

	RET
139+
140+
// round_consts holds 24 64-bit round constants, one per iteration of the
// permutation loop, stored in the order the loop reads them. The symbol is
// read-only and pointer-free, and is 24*8 = 192 bytes long.
DATA	round_consts+0x00(SB)/8, $0x0000000000000001
DATA	round_consts+0x08(SB)/8, $0x0000000000008082
DATA	round_consts+0x10(SB)/8, $0x800000000000808a
DATA	round_consts+0x18(SB)/8, $0x8000000080008000
DATA	round_consts+0x20(SB)/8, $0x000000000000808b
DATA	round_consts+0x28(SB)/8, $0x0000000080000001
DATA	round_consts+0x30(SB)/8, $0x8000000080008081
DATA	round_consts+0x38(SB)/8, $0x8000000000008009
DATA	round_consts+0x40(SB)/8, $0x000000000000008a
DATA	round_consts+0x48(SB)/8, $0x0000000000000088
DATA	round_consts+0x50(SB)/8, $0x0000000080008009
DATA	round_consts+0x58(SB)/8, $0x000000008000000a
DATA	round_consts+0x60(SB)/8, $0x000000008000808b
DATA	round_consts+0x68(SB)/8, $0x800000000000008b
DATA	round_consts+0x70(SB)/8, $0x8000000000008089
DATA	round_consts+0x78(SB)/8, $0x8000000000008003
DATA	round_consts+0x80(SB)/8, $0x8000000000008002
DATA	round_consts+0x88(SB)/8, $0x8000000000000080
DATA	round_consts+0x90(SB)/8, $0x000000000000800a
DATA	round_consts+0x98(SB)/8, $0x800000008000000a
DATA	round_consts+0xA0(SB)/8, $0x8000000080008081
DATA	round_consts+0xA8(SB)/8, $0x8000000000008080
DATA	round_consts+0xB0(SB)/8, $0x0000000080000001
DATA	round_consts+0xB8(SB)/8, $0x8000000080008008
// RODATA|NOPTR are the named textflag.h flags for the former (8+16).
GLOBL	round_consts(SB), RODATA|NOPTR, $192

sha3/keccakf_noasm.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build (!arm64 && !s390x && !ppc64le) || !arm || !gc || purego
6+
// +build !arm64,!s390x,!ppc64le !gc purego !arm
7+
8+
package sha3
9+
10+
// keccakF1600 is the variant compiled in when this file's build constraints
// hold. It forwards the state a unchanged to the generic Go implementation,
// keccakF1600Generic.
11+
func keccakF1600(a *[25]uint64) {
12+
keccakF1600Generic(a)
13+
}

0 commit comments

Comments
 (0)