Skip to content

Commit 4955147

Browse files
Ethan Millermundaym
Ethan Miller
authored and committed
math/big: add assembly implementation of arith for ppc64{le}
The existing implementation was written in pure Go, leading to slow cryptographic performance.

Implemented mulWW, subVV, mulAddVWW, addMulVVW, and bitLen for ppc64{le}.
Implemented divWW for ppc64le only, as the DIVDEU instruction is only available on Power8 or newer.

benchcmp output:

benchmark                        old ns/op    new ns/op    delta
BenchmarkSignP384                28934360     10877330     -62.41%
BenchmarkRSA2048Decrypt          41261033     5139930      -87.54%
BenchmarkRSA2048Sign             45231300     7610985      -83.17%
Benchmark3PrimeRSA2048Decrypt    20487300     2481408      -87.89%

Fixes #16621

Change-Id: If8b68963bb49909bde832f2bda08a3791c4f5b7a
Reviewed-on: https://go-review.googlesource.com/26951
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Michael Munday <[email protected]>
1 parent 0a7c73b commit 4955147

File tree

7 files changed

+259
-19
lines changed

7 files changed

+259
-19
lines changed

src/cmd/internal/obj/ppc64/a.out.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,10 @@ const (
483483
ACMPWU
484484
ADIVD
485485
ADIVDCC
486+
ADIVDE
487+
ADIVDECC
488+
ADIVDEU
489+
ADIVDEUCC
486490
ADIVDVCC
487491
ADIVDV
488492
ADIVDU

src/cmd/internal/obj/ppc64/anames.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,10 @@ var Anames = []string{
242242
"CMPWU",
243243
"DIVD",
244244
"DIVDCC",
245+
"DIVDE",
246+
"DIVDECC",
247+
"DIVDEU",
248+
"DIVDEUCC",
245249
"DIVDVCC",
246250
"DIVDV",
247251
"DIVDU",

src/cmd/internal/obj/ppc64/asm9.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,10 @@ func buildop(ctxt *obj.Link) {
10091009
opset(AMULLDV, r0)
10101010
opset(ADIVD, r0)
10111011
opset(ADIVDCC, r0)
1012+
opset(ADIVDE, r0)
1013+
opset(ADIVDEU, r0)
1014+
opset(ADIVDECC, r0)
1015+
opset(ADIVDEUCC, r0)
10121016
opset(ADIVDVCC, r0)
10131017
opset(ADIVDV, r0)
10141018
opset(ADIVDU, r0)
@@ -2670,6 +2674,18 @@ func oprrr(ctxt *obj.Link, a obj.As) uint32 {
26702674
case AREMDCC, ADIVDCC:
26712675
return OPVCC(31, 489, 0, 1)
26722676

2677+
case ADIVDE:
2678+
return OPVCC(31, 425, 0, 0)
2679+
2680+
case ADIVDECC:
2681+
return OPVCC(31, 425, 0, 1)
2682+
2683+
case ADIVDEU:
2684+
return OPVCC(31, 393, 0, 0)
2685+
2686+
case ADIVDEUCC:
2687+
return OPVCC(31, 393, 0, 1)
2688+
26732689
case AREMDV, ADIVDV:
26742690
return OPVCC(31, 489, 1, 0)
26752691

src/crypto/ecdsa/ecdsa_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,18 @@ func BenchmarkSignP256(b *testing.B) {
5454
}
5555
}
5656

57+
// BenchmarkSignP384 measures Sign on the P-384 curve. A key is generated once
// up front and the same fixed byte string is signed b.N times.
func BenchmarkSignP384(b *testing.B) {
58+
b.ResetTimer() // NOTE(review): redundant; the timer is reset again after the setup below
59+
p384 := elliptic.P384()
60+
hashed := []byte("testing") // fixed input bytes handed to Sign on every iteration
61+
priv, _ := GenerateKey(p384, rand.Reader) // the key-generation error is discarded
62+
63+
b.ResetTimer() // start timing here so curve and key setup above are not measured
64+
for i := 0; i < b.N; i++ {
65+
_, _, _ = Sign(rand.Reader, priv, hashed) // signature and error are discarded; only the call is timed
66+
}
67+
}
68+
5769
func BenchmarkVerifyP256(b *testing.B) {
5870
b.ResetTimer()
5971
p256 := elliptic.P256()

src/math/big/arith_ppc64.s

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2016 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// +build !math_big_pure_go,ppc64
6+
7+
#include "textflag.h"
8+
9+
// This file provides fast assembly versions for the elementary
10+
// arithmetic operations on vectors implemented in arith.go.
11+
12+
// func divWW(x1, x0, y Word) (q, r Word)
// On big-endian ppc64 there is no assembly divWW; control is transferred
// straight to divWW_g (presumably the portable Go version in arith.go).
// The commit message gives the reason: the DIVDEU instruction used by the
// ppc64le version is only available on Power8 or newer.
TEXT ·divWW(SB), NOSPLIT, $0
13+
BR ·divWW_g(SB)
14+

src/math/big/arith_ppc64le.s

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright 2016 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// +build !math_big_pure_go,ppc64le
6+
7+
#include "textflag.h"
8+
9+
// This file provides fast assembly versions for the elementary
10+
// arithmetic operations on vectors implemented in arith.go.
11+
12+
// func divWW(x1, x0, y Word) (q, r Word)
// divWW divides the two-word value x1:x0 (x1 is the high word) by y and
// stores the quotient in q and the remainder in r. When x1 >= y (unsigned)
// the divide sequence is skipped and both q and r are set to all ones.
13+
TEXT ·divWW(SB), NOSPLIT, $0
14+
MOVD x1+0(FP), R4
15+
MOVD x0+8(FP), R5
16+
MOVD y+16(FP), R6
17+
18+
CMPU R4, R6 // unsigned compare of x1 with y
19+
BGE divbigger // x1 >= y: take the all-ones path below
20+
21+
// Sequence from the programmer's note in ch. 3 of the ISA manual, p.74.
22+
DIVDEU R6, R4, R3 // q1 = (x1 << 64) / y
23+
DIVDU R6, R5, R7 // q2 = x0 / y
24+
MULLD R6, R3, R8 // R8 = low 64 bits of q1*y
25+
MULLD R6, R7, R20 // R20 = q2*y
26+
SUB R20, R5, R10 // r2 = x0 - q2*y
27+
ADD R7, R3, R3 // q = q1 + q2
28+
SUB R8, R10, R4 // r = r2 - low(q1*y), computed modulo 2^64
29+
CMPU R4, R10
30+
BLT adjust // r < r2: the remainder sum wrapped, so correct q and r
31+
CMPU R4, R6
32+
BLT end // r < y: q and r are already final
33+
34+
adjust:
35+
MOVD $1, R21
36+
ADD R21, R3, R3 // q++
37+
SUB R6, R4, R4 // r -= y
38+
39+
end:
40+
MOVD R3, q+24(FP)
41+
MOVD R4, r+32(FP)
42+
43+
RET
44+
45+
divbigger:
46+
MOVD $-1, R7 // all bits set
47+
MOVD R7, q+24(FP)
48+
MOVD R7, r+32(FP)
49+
RET
50+

src/math/big/arith_ppc64x.s

Lines changed: 159 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,178 @@
99
// This file provides fast assembly versions for the elementary
1010
// arithmetic operations on vectors implemented in arith.go.
1111

12-
TEXT ·mulWW(SB),NOSPLIT,$0
13-
BR ·mulWW_g(SB)
12+
// func mulWW(x, y Word) (z1, z0 Word)
// mulWW computes the full two-word product of x and y: z1 receives the
// high 64 bits and z0 the low 64 bits.
13+
TEXT ·mulWW(SB), NOSPLIT, $0
14+
MOVD x+0(FP), R4
15+
MOVD y+8(FP), R5
16+
MULHDU R4, R5, R6 // R6 = high 64 bits of the unsigned product x*y
17+
MULLD R4, R5, R7 // R7 = low 64 bits of x*y
18+
MOVD R6, z1+16(FP)
19+
MOVD R7, z0+24(FP)
20+
RET
1421

15-
TEXT ·divWW(SB),NOSPLIT,$0
16-
BR ·divWW_g(SB)
17-
18-
TEXT ·addVV(SB),NOSPLIT,$0
22+
// addVV has no assembly body in this file; it branches to addVV_g
// (presumably the portable Go version in arith.go).
TEXT ·addVV(SB), NOSPLIT, $0
1923
BR ·addVV_g(SB)
2024

21-
TEXT ·subVV(SB),NOSPLIT,$0
22-
BR ·subVV_g(SB)
25+
// func subVV(z, x, y []Word) (c Word)
26+
// z[i] = x[i] - y[i] for all i in [0, len(z)), propagating the borrow from
// one word to the next. The returned c is the final borrow (0 or 1).
// Only z's length is read; x and y are assumed to be at least that long
// (TODO confirm against callers).
27+
TEXT ·subVV(SB), NOSPLIT, $0
28+
MOVD z_len+8(FP), R7
29+
MOVD x+24(FP), R8
30+
MOVD y+48(FP), R9
31+
MOVD z+0(FP), R10
32+
33+
MOVD $0, R4 // c = 0
34+
MOVD $0, R5 // i = 0
35+
MOVD $1, R29 // constant 1 in a register (work around lack of ADDI)
36+
MOVD $8, R28 // word size in bytes (work around lack of scaled addressing)
37+
38+
SUBC R0, R0 // NOTE(review): R0 - R0 leaves CA set, which means "no borrow" here; the final XOR below inverts it
39+
JMP sublend
40+
41+
// amd64 saves and restores CF, but that is believed to be necessary only because
42+
// all of its math operations clobber it - here CA can simply be recovered at the end.
43+
subloop:
44+
MULLD R5, R28, R6 // R6 = i*8, the byte offset of element i
45+
MOVD (R8)(R6), R11 // x[i]
46+
MOVD (R9)(R6), R12 // y[i]
47+
48+
SUBE R12, R11, R15 // R15 = x[i] - y[i] - incoming borrow; CA is updated for the next word
49+
MOVD R15, (R10)(R6) // z[i] = R15
2350

24-
TEXT ·addVW(SB),NOSPLIT,$0
51+
ADD R29, R5 // i++
52+
53+
sublend:
54+
CMP R5, R7
55+
BLT subloop // loop while i < len(z)
56+
57+
ADDZE R4 // R4 = 0 + CA
58+
XOR R29, R4 // invert the low bit: c = 1 when CA is clear (a borrow occurred)
59+
MOVD R4, c+72(FP)
60+
RET
61+
62+
// addVW has no assembly body in this file; it branches to addVW_g
// (presumably the portable Go version in arith.go).
TEXT ·addVW(SB), NOSPLIT, $0
2563
BR ·addVW_g(SB)
2664

27-
TEXT ·subVW(SB),NOSPLIT,$0
65+
// subVW has no assembly body in this file; it branches to subVW_g
// (presumably the portable Go version in arith.go).
TEXT ·subVW(SB), NOSPLIT, $0
2866
BR ·subVW_g(SB)
2967

30-
TEXT ·shlVU(SB),NOSPLIT,$0
68+
// shlVU has no assembly body in this file; it branches to shlVU_g
// (presumably the portable Go version in arith.go).
TEXT ·shlVU(SB), NOSPLIT, $0
3169
BR ·shlVU_g(SB)
3270

33-
TEXT ·shrVU(SB),NOSPLIT,$0
71+
// shrVU has no assembly body in this file; it branches to shrVU_g
// (presumably the portable Go version in arith.go).
TEXT ·shrVU(SB), NOSPLIT, $0
3472
BR ·shrVU_g(SB)
3573

36-
TEXT ·mulAddVWW(SB),NOSPLIT,$0
37-
BR ·mulAddVWW_g(SB)
74+
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i in [0, len(z)): z[i] is set to the low word of x[i]*y + c and
// c becomes the high word of that sum. c starts as r and its final value is
// returned. Only z's length is read; x is assumed to be at least that long
// (TODO confirm against callers).
75+
TEXT ·mulAddVWW(SB), NOSPLIT, $0
76+
MOVD z+0(FP), R10
77+
MOVD x+24(FP), R8
78+
MOVD y+48(FP), R9
79+
MOVD r+56(FP), R4 // c = r
80+
MOVD z_len+8(FP), R11
81+
MOVD $0, R3 // i = 0
82+
MOVD $8, R18 // word size in bytes, used to scale the index
83+
MOVD $1, R19 // loop increment
84+
85+
JMP e5 // test the loop condition first so an empty z does no work
86+
87+
l5:
88+
MULLD R18, R3, R5 // R5 = i*8, the byte offset of element i
89+
MOVD (R8)(R5), R20 // R20 = x[i]
90+
MULLD R9, R20, R6 // R6 = low word of x[i]*y
91+
MULHDU R9, R20, R7 // R7 = high word of x[i]*y
92+
ADDC R4, R6 // low word += c, recording the carry out in CA
93+
ADDZE R7 // high word += CA
94+
MOVD R6, (R10)(R5) // z[i] = low word
95+
MOVD R7, R4 // c = high word
96+
ADD R19, R3 // i++
97+
98+
e5:
99+
CMP R3, R11
100+
BLT l5 // loop while i < len(z)
101+
102+
MOVD R4, c+64(FP)
103+
RET
104+
105+
// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i in [0, len(z)): z[i] is set to the low word of
// z[i] + x[i]*y + c and c becomes the high word of that sum. c starts at 0
// and its final value is returned. Elements are processed two per iteration
// in "unrolled" while a pair remains; "loop" then handles any leftover
// element one at a time. Only z's length is read; x is assumed to be at
// least that long (TODO confirm against callers).
106+
TEXT ·addMulVVW(SB), NOSPLIT, $0
107+
MOVD z+0(FP), R10
108+
MOVD x+24(FP), R8
109+
MOVD y+48(FP), R9
110+
MOVD z_len+8(FP), R22
111+
112+
MOVD $0, R5 // i = 0
113+
MOVD $0, R4 // c = 0
114+
MOVD $8, R28 // word size in bytes
115+
MOVD $-2, R23
116+
AND R22, R23 // R23 = z.len with its lowest bit cleared (largest even count <= z.len)
117+
MOVD $2, R24
118+
CMP R23, R24
119+
BGE unrolled // at least one full pair: use the two-at-a-time loop
120+
JMP end
121+
122+
unrolled:
123+
MOVD $8, R19 // no (RA)(RB*8) on power
124+
MULLD R5, R19 // R19 = i*8, the byte offset of element i
125+
MOVD (R10)(R19), R11 // R11 = z[i]
126+
MOVD (R8)(R19), R16 // R16 = x[i]
127+
ADD R28, R19, R25 // R25 = byte offset of element i+1
128+
MOVD (R10)(R25), R17 // R17 = z[i+1]
129+
MOVD (R8)(R25), R18 // R18 = x[i+1]
130+
131+
MULLD R9, R16, R12 // R12 = low word of x[i]*y
132+
MULHDU R9, R16, R14 // R14 = high word of x[i]*y
133+
MULLD R9, R18, R6 // R6 = low word of x[i+1]*y
134+
MULHDU R9, R18, R7 // R7 = high word of x[i+1]*y
135+
ADDC R4, R12 // low word += incoming carry c
136+
ADDZE R14 // fold the carry out into the high word
137+
ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
138+
ADDZE R14 // carry = high order bits + add carry
139+
MOVD R12, (R10)(R19)
140+
ADDC R14, R6 // second element: low word += carry from the first element
141+
ADDZE R7
142+
ADDC R17, R6 // low word += z[i+1]
143+
ADDZE R7
144+
MOVD R6, (R10)(R25) // store z[i+1]
145+
MOVD R7, R4 // c = high word of the second element
146+
147+
ADD R24, R5 // i += 2
148+
CMP R5, R23
149+
BLT unrolled // loop while i < even part of z.len
150+
JMP end
151+
152+
loop:
153+
MOVD $8, R19
154+
MULLD R5, R19 // R19 = i*8, the byte offset of element i
155+
MOVD (R10)(R19), R11 // R11 = z[i]
156+
MOVD (R8)(R19), R16 // R16 = x[i]
157+
MULLD R9, R16, R12 // R12 = low word of x[i]*y
158+
MULHDU R9, R16, R14 // R14 = high word of x[i]*y
159+
ADDC R4, R12 // low word += incoming carry c
160+
ADDZE R14
161+
ADDC R11, R12 // low word += z[i]
162+
ADDZE R14
163+
MOVD R12, (R10)(R19) // store z[i]
164+
MOVD R14, R4 // c = high word
165+
166+
MOVD $1, R15
167+
ADD R15, R5 // i++
168+
169+
end:
170+
CMP R5, R22
171+
BLT loop // handle any element left over after the paired loop
38172

39-
TEXT ·addMulVVW(SB),NOSPLIT,$0
40-
BR ·addMulVVW_g(SB)
173+
MOVD R4, c+56(FP)
174+
RET
41175

42-
TEXT ·divWVW(SB),NOSPLIT,$0
176+
// divWVW has no assembly body in this file; it branches to divWVW_g
// (presumably the portable Go version in arith.go).
TEXT ·divWVW(SB), NOSPLIT, $0
43177
BR ·divWVW_g(SB)
44178

45-
TEXT ·bitLen(SB),NOSPLIT,$0
46-
BR ·bitLen_g(SB)
179+
// func bitLen(x Word) int
// bitLen returns 64 minus the number of leading zero bits in x, i.e. the
// number of bits needed to represent x.
180+
TEXT ·bitLen(SB), NOSPLIT, $0
181+
MOVD x+0(FP), R4
182+
CNTLZD R4, R4 // R4 = count of leading zero bits in x
183+
MOVD $64, R5
184+
SUB R4, R5 // R5 = 64 - leading zeros
185+
MOVD R5, n+8(FP)
186+
RET

0 commit comments

Comments
 (0)