Skip to content

Commit 3cb41be

Browse files
ceseo authored and laboger committed
math/big: improve performance for AddMulVVW and mulAddVWW for ppc64x
This change adds a better implementation in asm for AddMulVVW and mulAddVWW for ppc64x, with speedups up to 1.54x.

benchmark                        old ns/op    new ns/op    delta
BenchmarkAddMulVVW/1-8           6.58         6.29         -4.41%
BenchmarkAddMulVVW/2-8           7.43         7.25         -2.42%
BenchmarkAddMulVVW/3-8           8.95         8.15         -8.94%
BenchmarkAddMulVVW/4-8           10.1         9.37         -7.23%
BenchmarkAddMulVVW/5-8           12.0         10.7         -10.83%
BenchmarkAddMulVVW/10-8          22.1         20.1         -9.05%
BenchmarkAddMulVVW/100-8         211          154          -27.01%
BenchmarkAddMulVVW/1000-8        2046         1450         -29.13%
BenchmarkAddMulVVW/10000-8       20407        14793        -27.51%
BenchmarkAddMulVVW/100000-8      223857       145548       -34.98%

benchmark                        old MB/s     new MB/s     speedup
BenchmarkAddMulVVW/1-8           9719.88      10175.79     1.05x
BenchmarkAddMulVVW/2-8           17233.97     17657.54     1.02x
BenchmarkAddMulVVW/3-8           21446.05     23550.49     1.10x
BenchmarkAddMulVVW/4-8           25375.70     27334.33     1.08x
BenchmarkAddMulVVW/5-8           26650.52     30029.34     1.13x
BenchmarkAddMulVVW/10-8          28984.29     31833.68     1.10x
BenchmarkAddMulVVW/100-8         30249.41     41531.69     1.37x
BenchmarkAddMulVVW/1000-8        31273.35     44108.54     1.41x
BenchmarkAddMulVVW/10000-8       31360.47     43263.54     1.38x
BenchmarkAddMulVVW/100000-8      28589.58     43971.66     1.54x

Change-Id: I8a8105d4da3592afdef3125757a99f378a0254bb
Reviewed-on: https://go-review.googlesource.com/53931
Run-TryBot: Lynn Boger <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Lynn Boger <[email protected]>
1 parent 92cfd07 commit 3cb41be

File tree

1 file changed

+42
-86
lines changed

1 file changed

+42
-86
lines changed

src/math/big/arith_ppc64x.s

Lines changed: 42 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -98,103 +98,59 @@ TEXT ·shrVU(SB), NOSPLIT, $0
9898

9999
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For i = 0 .. z_len-1, stores the low-order word of x[i]*y + c into z[i],
// where c starts as r and is replaced on every step by the high-order word
// of the product plus the carry out of the low-order add. The final c is
// the return value; when z_len is 0 the loop is skipped and r is returned.
100100
TEXT ·mulAddVWW(SB), NOSPLIT, $0
101-
MOVD z+0(FP), R10
102-
MOVD x+24(FP), R8
103-
MOVD y+48(FP), R9
104-
MOVD r+56(FP), R4 // c = r
105-
MOVD z_len+8(FP), R11
106-
MOVD $0, R3 // i = 0
107-
MOVD $8, R18
108-
MOVD $1, R19
109-
110-
JMP e5
111-
112-
l5:
113-
MULLD R18, R3, R5
114-
MOVD (R8)(R5), R20
115-
MULLD R9, R20, R6
116-
MULHDU R9, R20, R7
117-
ADDC R4, R6
118-
ADDZE R7
119-
MOVD R6, (R10)(R5)
120-
MOVD R7, R4
121-
ADD R19, R3
101+
MOVD z+0(FP), R10 // R10 = base address of z[]
102+
MOVD x+24(FP), R8 // R8 = base address of x[]
103+
MOVD y+48(FP), R9 // R9 = y (multiplier)
104+
MOVD r+56(FP), R4 // R4 = c, seeded with r
105+
MOVD z_len+8(FP), R11 // R11 = z_len (number of words to process)
122106

123-
e5:
124-
CMP R3, R11
125-
BLT l5
107+
MOVD R0, R3 // R3 = byte offset into x[] and z[], starts at 0 (R0 supplies the zero)
108+
CMP R0, R11 // compare 0 with z_len; result is consumed by the BEQ below
109+
MOVD R11, CTR // CTR = z_len, the loop trip count
110+
BEQ done // z_len == 0: skip the loop and return c = r
126111

112+
loop:
113+
MOVD (R8)(R3), R20 // R20 = x[i]
114+
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
115+
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
116+
ADDC R4, R6 // R6 = z0 + c, recording the carry for ADDZE
117+
ADDZE R7 // R7 = z1 + carry from the add above
118+
MOVD R6, (R10)(R3) // z[i] = R6
119+
MOVD R7, R4 // c = R7 for the next iteration
120+
ADD $8, R3 // step the byte offset to the next 8-byte word
121+
BC 16, 0, loop // bdnz: decrement CTR and branch to loop while CTR != 0
122+
123+
done:
127124
MOVD R4, c+64(FP) // return c
128125
RET
129126

130127
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// For i = 0 .. z_len-1, replaces z[i] with the low-order word of
// x[i]*y + z[i] + c, where c starts at 0 and is replaced on every step by
// the high-order word of the product plus the carries out of the two
// low-order adds. The final c is the return value; when z_len is 0 the
// loop is skipped and 0 is returned.
131128
TEXT ·addMulVVW(SB), NOSPLIT, $0
132-
MOVD z+0(FP), R10
133-
MOVD x+24(FP), R8
134-
MOVD y+48(FP), R9
135-
MOVD z_len+8(FP), R22
136-
137-
MOVD $0, R5 // i = 0
138-
MOVD $0, R4 // c = 0
139-
MOVD $8, R28
140-
MOVD $-2, R23
141-
AND R22, R23 // mask the last bit of z.len
142-
MOVD $2, R24
143-
CMP R23, R24
144-
BGE unrolled
145-
JMP end
146-
147-
unrolled:
148-
MOVD $8, R19 // no (RA)(RB*8) on power
149-
MULLD R5, R19
150-
MOVD (R10)(R19), R11 // R11 = z[i]
151-
MOVD (R8)(R19), R16 // R16 = x[i]
152-
ADD R28, R19, R25
153-
MOVD (R10)(R25), R17
154-
MOVD (R8)(R25), R18
155-
156-
MULLD R9, R16, R12
157-
MULHDU R9, R16, R14
158-
MULLD R9, R18, R6
159-
MULHDU R9, R18, R7
160-
ADDC R4, R12
161-
ADDZE R14
162-
ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
163-
ADDZE R14 // carry = high order bits + add carry
164-
MOVD R12, (R10)(R19)
165-
ADDC R14, R6
166-
ADDZE R7
167-
ADDC R17, R6
168-
ADDZE R7
169-
MOVD R6, (R10)(R25)
170-
MOVD R7, R4
129+
MOVD z+0(FP), R10 // R10 = base address of z[]
130+
MOVD x+24(FP), R8 // R8 = base address of x[]
131+
MOVD y+48(FP), R9 // R9 = y (multiplier)
132+
MOVD z_len+8(FP), R22 // R22 = z_len (number of words to process)
171133

172-
ADD R24, R5
173-
CMP R5, R23
174-
BLT unrolled
175-
JMP end
134+
MOVD R0, R3 // R3 = byte offset into x[] and z[], starts at 0 (R0 supplies the zero)
135+
CMP R0, R22 // compare 0 with z_len; result is consumed by the BEQ below
136+
MOVD R0, R4 // R4 = c = 0
137+
MOVD R22, CTR // CTR = z_len, the loop trip count
138+
BEQ done // z_len == 0: skip the loop and return c = 0
176139

177140
loop:
178-
MOVD $8, R19
179-
MULLD R5, R19
180-
MOVD (R10)(R19), R11
181-
MOVD (R8)(R19), R16
182-
MULLD R9, R16, R12
183-
MULHDU R9, R16, R14
184-
ADDC R4, R12
185-
ADDZE R14
186-
ADDC R11, R12
187-
ADDZE R14
188-
MOVD R12, (R10)(R19)
189-
MOVD R14, R4
190-
191-
MOVD $1, R15
192-
ADD R15, R5
193-
194-
end:
195-
CMP R5, R22
196-
BLT loop
141+
MOVD (R8)(R3), R20 // R20 = x[i]
142+
MOVD (R10)(R3), R21 // R21 = z[i]
143+
MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
144+
MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
145+
ADDC R21, R6 // R6 = z0 = Low-order(x[i]*y) + z[i], recording the carry
146+
ADDZE R7 // R7 = z1 = High-order(x[i]*y) + carry from the add above
147+
ADDC R4, R6 // R6 = z0 + c, recording the carry
148+
ADDZE R7, R4 // R4 = c = z1 + carry (overwrites the previous c)
149+
MOVD R6, (R10)(R3) // z[i] = R6
150+
ADD $8, R3 // step the byte offset to the next 8-byte word
151+
BC 16, 0, loop // bdnz: decrement CTR and branch to loop while CTR != 0
197152

153+
done:
198154
MOVD R4, c+56(FP) // return c
199155
RET
200156

0 commit comments

Comments
 (0)