Skip to content

Commit 849dba0

Browse files
committed
runtime: port performance-critical functions to regabi
This CL ports a few performance-critical runtime assembly functions to use register arguments directly. While using the faster ABI is nice, the real win here is that we avoid ABI wrappers: since these are "builtin" functions in the compiler, it can generate calls to them without knowing that their native implementation is ABI0. Hence, it generates ABIInternal calls that go through ABI wrappers. By porting them to use ABIInternal natively, we avoid the overhead of the ABI wrapper.

This significantly improves performance on several benchmarks, comparing regabiwrappers before and after this change:

name old time/op new time/op delta
BiogoIgor 15.7s ± 2% 15.7s ± 2% ~ (p=0.617 n=25+25)
BiogoKrishna 18.5s ± 5% 17.7s ± 2% -4.61% (p=0.000 n=25+25)
BleveIndexBatch100 5.91s ± 3% 5.82s ± 3% -1.60% (p=0.000 n=25+25)
BleveQuery 6.76s ± 0% 6.60s ± 1% -2.31% (p=0.000 n=22+25)
CompileTemplate 248ms ± 5% 245ms ± 1% ~ (p=0.643 n=25+20)
CompileUnicode 94.4ms ± 3% 93.9ms ± 2% ~ (p=0.152 n=24+23)
CompileGoTypes 1.60s ± 2% 1.59s ± 2% ~ (p=0.059 n=24+24)
CompileCompiler 104ms ± 3% 103ms ± 1% ~ (p=0.056 n=25+22)
CompileSSA 10.9s ± 1% 10.9s ± 1% ~ (p=0.052 n=25+25)
CompileFlate 156ms ± 8% 152ms ± 1% -2.49% (p=0.008 n=25+21)
CompileGoParser 248ms ± 1% 249ms ± 2% ~ (p=0.058 n=21+20)
CompileReflect 595ms ± 3% 601ms ± 4% ~ (p=0.182 n=25+25)
CompileTar 211ms ± 2% 211ms ± 1% ~ (p=0.663 n=23+23)
CompileXML 282ms ± 2% 284ms ± 5% ~ (p=0.456 n=21+23)
CompileStdCmd 13.6s ± 2% 13.5s ± 2% ~ (p=0.112 n=25+24)
FoglemanFauxGLRenderRotateBoat 8.69s ± 2% 8.67s ± 0% ~ (p=0.094 n=22+25)
FoglemanPathTraceRenderGopherIter1 20.2s ± 2% 20.7s ± 3% +2.53% (p=0.000 n=24+24)
GopherLuaKNucleotide 31.4s ± 1% 31.0s ± 1% -1.28% (p=0.000 n=25+24)
MarkdownRenderXHTML 246ms ± 1% 244ms ± 1% -0.79% (p=0.000 n=20+21)
Tile38WithinCircle100kmRequest 843µs ± 4% 818µs ± 4% -2.93% (p=0.000 n=25+25)
Tile38IntersectsCircle100kmRequest 1.06ms ± 5% 1.05ms ± 3% -1.19% (p=0.021 n=24+25)
Tile38KNearestLimit100Request 1.01ms ± 1% 1.01ms ± 2% ~ (p=0.335 n=22+25)
[Geo mean] 596ms 592ms -0.71%

(https://perf.golang.org/search?q=upload:20210411.5)

It also significantly reduces the performance penalty of enabling regabiwrappers, though it doesn't yet fully close the gap on all benchmarks:

name old time/op new time/op delta
BiogoIgor 15.7s ± 1% 15.7s ± 2% ~ (p=0.366 n=24+25)
BiogoKrishna 17.7s ± 2% 17.7s ± 2% ~ (p=0.315 n=23+25)
BleveIndexBatch100 5.86s ± 4% 5.82s ± 3% ~ (p=0.137 n=24+25)
BleveQuery 6.55s ± 0% 6.60s ± 1% +0.83% (p=0.000 n=24+25)
CompileTemplate 244ms ± 1% 245ms ± 1% ~ (p=0.208 n=21+20)
CompileUnicode 94.0ms ± 4% 93.9ms ± 2% ~ (p=0.666 n=24+23)
CompileGoTypes 1.60s ± 2% 1.59s ± 2% ~ (p=0.154 n=25+24)
CompileCompiler 103ms ± 1% 103ms ± 1% ~ (p=0.905 n=24+22)
CompileSSA 10.9s ± 2% 10.9s ± 1% ~ (p=0.803 n=25+25)
CompileFlate 153ms ± 1% 152ms ± 1% ~ (p=0.182 n=23+21)
CompileGoParser 250ms ± 2% 249ms ± 2% ~ (p=0.843 n=24+20)
CompileReflect 595ms ± 4% 601ms ± 4% ~ (p=0.141 n=25+25)
CompileTar 212ms ± 3% 211ms ± 1% ~ (p=0.499 n=23+23)
CompileXML 282ms ± 1% 284ms ± 5% ~ (p=0.129 n=20+23)
CompileStdCmd 13.5s ± 2% 13.5s ± 2% ~ (p=0.480 n=24+24)
FoglemanFauxGLRenderRotateBoat 8.66s ± 1% 8.67s ± 0% ~ (p=0.325 n=25+25)
FoglemanPathTraceRenderGopherIter1 20.6s ± 3% 20.7s ± 3% ~ (p=0.137 n=25+24)
GopherLuaKNucleotide 30.5s ± 2% 31.0s ± 1% +1.68% (p=0.000 n=23+24)
MarkdownRenderXHTML 243ms ± 1% 244ms ± 1% +0.51% (p=0.000 n=23+21)
Tile38WithinCircle100kmRequest 801µs ± 2% 818µs ± 4% +2.11% (p=0.000 n=25+25)
Tile38IntersectsCircle100kmRequest 1.01ms ± 2% 1.05ms ± 3% +4.34% (p=0.000 n=24+25)
Tile38KNearestLimit100Request 1.00ms ± 1% 1.01ms ± 2% +0.81% (p=0.008 n=21+25)
[Geo mean] 589ms 592ms +0.50%

(https://perf.golang.org/search?q=upload:20210411.6)

Change-Id: I8f77f010b0abc658064df569a27a9c7a7b1c7bf9
Reviewed-on: https://go-review.googlesource.com/c/go/+/308931
Trust: Austin Clements <[email protected]>
Run-TryBot: Austin Clements <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
TryBot-Result: Go Bot <[email protected]>
1 parent 865d2bc commit 849dba0

File tree

4 files changed

+174
-58
lines changed

4 files changed

+174
-58
lines changed

src/runtime/asm_amd64.s

Lines changed: 95 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,34 +1011,62 @@ done:
10111011

10121012
// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
10131013
// hash function using AES hardware instructions
1014-
TEXT runtime·memhash(SB),NOSPLIT,$0-32
1014+
TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32
1015+
#ifdef GOEXPERIMENT_regabiargs
1016+
// AX = ptr to data
1017+
// BX = seed
1018+
// CX = size
1019+
#endif
10151020
CMPB runtime·useAeshash(SB), $0
10161021
JEQ noaes
1022+
#ifndef GOEXPERIMENT_regabiargs
10171023
MOVQ p+0(FP), AX // ptr to data
10181024
MOVQ s+16(FP), CX // size
10191025
LEAQ ret+24(FP), DX
1026+
#endif
10201027
JMP aeshashbody<>(SB)
10211028
noaes:
1022-
JMP runtime·memhashFallback(SB)
1029+
JMP runtime·memhashFallback<ABIInternal>(SB)
10231030

10241031
// func strhash(p unsafe.Pointer, h uintptr) uintptr
1025-
TEXT runtime·strhash(SB),NOSPLIT,$0-24
1032+
TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24
1033+
#ifdef GOEXPERIMENT_regabiargs
1034+
// AX = ptr to string struct
1035+
// BX = seed
1036+
#endif
10261037
CMPB runtime·useAeshash(SB), $0
10271038
JEQ noaes
1039+
#ifndef GOEXPERIMENT_regabiargs
10281040
MOVQ p+0(FP), AX // ptr to string struct
1041+
#endif
10291042
MOVQ 8(AX), CX // length of string
10301043
MOVQ (AX), AX // string data
1044+
#ifndef GOEXPERIMENT_regabiargs
10311045
LEAQ ret+16(FP), DX
1046+
#endif
10321047
JMP aeshashbody<>(SB)
10331048
noaes:
1034-
JMP runtime·strhashFallback(SB)
1049+
JMP runtime·strhashFallback<ABIInternal>(SB)
10351050

10361051
// AX: data
1052+
#ifdef GOEXPERIMENT_regabiargs
1053+
// BX: hash seed
1054+
#else
1055+
// h+8(FP): hash seed
1056+
#endif
10371057
// CX: length
1058+
#ifdef GOEXPERIMENT_regabiargs
1059+
// At return: AX = return value
1060+
#else
10381061
// DX: address to put return value
1062+
#endif
10391063
TEXT aeshashbody<>(SB),NOSPLIT,$0-0
10401064
// Fill an SSE register with our seeds.
1065+
#ifdef GOEXPERIMENT_regabiargs
1066+
MOVQ BX, X0 // 64 bits of per-table hash seed
1067+
#else
10411068
MOVQ h+8(FP), X0 // 64 bits of per-table hash seed
1069+
#endif
10421070
PINSRW $4, CX, X0 // 16 bits of length
10431071
PSHUFHW $0, X0, X0 // repeat length 4 times total
10441072
MOVO X0, X1 // save unscrambled seed
@@ -1075,7 +1103,11 @@ final1:
10751103
AESENC X1, X1 // scramble combo 3 times
10761104
AESENC X1, X1
10771105
AESENC X1, X1
1106+
#ifdef GOEXPERIMENT_regabiargs
1107+
MOVQ X1, AX // return X1
1108+
#else
10781109
MOVQ X1, (DX)
1110+
#endif
10791111
RET
10801112

10811113
endofpage:
@@ -1091,7 +1123,11 @@ endofpage:
10911123
aes0:
10921124
// Return scrambled input seed
10931125
AESENC X0, X0
1126+
#ifdef GOEXPERIMENT_regabiargs
1127+
MOVQ X0, AX // return X0
1128+
#else
10941129
MOVQ X0, (DX)
1130+
#endif
10951131
RET
10961132

10971133
aes16:
@@ -1121,7 +1157,11 @@ aes17to32:
11211157

11221158
// combine results
11231159
PXOR X3, X2
1160+
#ifdef GOEXPERIMENT_regabiargs
1161+
MOVQ X2, AX // return X2
1162+
#else
11241163
MOVQ X2, (DX)
1164+
#endif
11251165
RET
11261166

11271167
aes33to64:
@@ -1163,7 +1203,11 @@ aes33to64:
11631203
PXOR X6, X4
11641204
PXOR X7, X5
11651205
PXOR X5, X4
1206+
#ifdef GOEXPERIMENT_regabiargs
1207+
MOVQ X4, AX // return X4
1208+
#else
11661209
MOVQ X4, (DX)
1210+
#endif
11671211
RET
11681212

11691213
aes65to128:
@@ -1245,7 +1289,15 @@ aes65to128:
12451289
PXOR X10, X8
12461290
PXOR X11, X9
12471291
PXOR X9, X8
1292+
#ifdef GOEXPERIMENT_regabig
1293+
// X15 must be zero on return
1294+
PXOR X15, X15
1295+
#endif
1296+
#ifdef GOEXPERIMENT_regabiargs
1297+
MOVQ X8, AX // return X8
1298+
#else
12481299
MOVQ X8, (DX)
1300+
#endif
12491301
RET
12501302

12511303
aes129plus:
@@ -1361,38 +1413,73 @@ aesloop:
13611413
PXOR X10, X8
13621414
PXOR X11, X9
13631415
PXOR X9, X8
1416+
#ifdef GOEXPERIMENT_regabig
1417+
// X15 must be zero on return
1418+
PXOR X15, X15
1419+
#endif
1420+
#ifdef GOEXPERIMENT_regabiargs
1421+
MOVQ X8, AX // return X8
1422+
#else
13641423
MOVQ X8, (DX)
1424+
#endif
13651425
RET
13661426

13671427
// func memhash32(p unsafe.Pointer, h uintptr) uintptr
1368-
TEXT runtime·memhash32(SB),NOSPLIT,$0-24
1428+
// ABIInternal for performance.
1429+
TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24
1430+
#ifdef GOEXPERIMENT_regabiargs
1431+
// AX = ptr to data
1432+
// BX = seed
1433+
#endif
13691434
CMPB runtime·useAeshash(SB), $0
13701435
JEQ noaes
1436+
#ifdef GOEXPERIMENT_regabiargs
1437+
MOVQ BX, X0 // X0 = seed
1438+
#else
13711439
MOVQ p+0(FP), AX // ptr to data
13721440
MOVQ h+8(FP), X0 // seed
1441+
#endif
13731442
PINSRD $2, (AX), X0 // data
13741443
AESENC runtime·aeskeysched+0(SB), X0
13751444
AESENC runtime·aeskeysched+16(SB), X0
13761445
AESENC runtime·aeskeysched+32(SB), X0
1446+
#ifdef GOEXPERIMENT_regabiargs
1447+
MOVQ X0, AX // return X0
1448+
#else
13771449
MOVQ X0, ret+16(FP)
1450+
#endif
13781451
RET
13791452
noaes:
1380-
JMP runtime·memhash32Fallback(SB)
1453+
JMP runtime·memhash32Fallback<ABIInternal>(SB)
13811454

13821455
// func memhash64(p unsafe.Pointer, h uintptr) uintptr
1383-
TEXT runtime·memhash64(SB),NOSPLIT,$0-24
1456+
// ABIInternal for performance.
1457+
TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24
1458+
#ifdef GOEXPERIMENT_regabiargs
1459+
// AX = ptr to data
1460+
// BX = seed
1461+
#else
1462+
#endif
13841463
CMPB runtime·useAeshash(SB), $0
13851464
JEQ noaes
1465+
#ifdef GOEXPERIMENT_regabiargs
1466+
MOVQ BX, X0 // X0 = seed
1467+
#else
13861468
MOVQ p+0(FP), AX // ptr to data
13871469
MOVQ h+8(FP), X0 // seed
1470+
#endif
13881471
PINSRQ $1, (AX), X0 // data
13891472
AESENC runtime·aeskeysched+0(SB), X0
13901473
AESENC runtime·aeskeysched+16(SB), X0
13911474
AESENC runtime·aeskeysched+32(SB), X0
1475+
#ifdef GOEXPERIMENT_regabiargs
1476+
MOVQ X0, AX // return X0
1477+
#else
13921478
MOVQ X0, ret+16(FP)
1479+
#endif
13931480
RET
13941481
noaes:
1395-
JMP runtime·memhash64Fallback(SB)
1482+
JMP runtime·memhash64Fallback<ABIInternal>(SB)
13961483

13971484
// simple mask to get rid of data in the high part of the register.
13981485
DATA masks<>+0x00(SB)/8, $0x0000000000000000

src/runtime/memclr_amd64.s

Lines changed: 57 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,16 @@
1212
// See memclrNoHeapPointers Go doc for important implementation constraints.
1313

1414
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
15-
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
15+
// ABIInternal for performance.
16+
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
17+
#ifdef GOEXPERIMENT_regabiargs
18+
// AX = ptr
19+
// BX = n
20+
MOVQ AX, DI // DI = ptr
21+
#else
1622
MOVQ ptr+0(FP), DI
1723
MOVQ n+8(FP), BX
24+
#endif
1825
XORQ AX, AX
1926

2027
// MOVOU seems always faster than REP STOSQ.
@@ -31,7 +38,9 @@ tail:
3138
JE _8
3239
CMPQ BX, $16
3340
JBE _9through16
34-
PXOR X0, X0
41+
#ifndef GOEXPERIMENT_regabig
42+
PXOR X15, X15
43+
#endif
3544
CMPQ BX, $32
3645
JBE _17through32
3746
CMPQ BX, $64
@@ -45,22 +54,22 @@ tail:
4554
// TODO: for really big clears, use MOVNTDQ, even without AVX2.
4655

4756
loop:
48-
MOVOU X0, 0(DI)
49-
MOVOU X0, 16(DI)
50-
MOVOU X0, 32(DI)
51-
MOVOU X0, 48(DI)
52-
MOVOU X0, 64(DI)
53-
MOVOU X0, 80(DI)
54-
MOVOU X0, 96(DI)
55-
MOVOU X0, 112(DI)
56-
MOVOU X0, 128(DI)
57-
MOVOU X0, 144(DI)
58-
MOVOU X0, 160(DI)
59-
MOVOU X0, 176(DI)
60-
MOVOU X0, 192(DI)
61-
MOVOU X0, 208(DI)
62-
MOVOU X0, 224(DI)
63-
MOVOU X0, 240(DI)
57+
MOVOU X15, 0(DI)
58+
MOVOU X15, 16(DI)
59+
MOVOU X15, 32(DI)
60+
MOVOU X15, 48(DI)
61+
MOVOU X15, 64(DI)
62+
MOVOU X15, 80(DI)
63+
MOVOU X15, 96(DI)
64+
MOVOU X15, 112(DI)
65+
MOVOU X15, 128(DI)
66+
MOVOU X15, 144(DI)
67+
MOVOU X15, 160(DI)
68+
MOVOU X15, 176(DI)
69+
MOVOU X15, 192(DI)
70+
MOVOU X15, 208(DI)
71+
MOVOU X15, 224(DI)
72+
MOVOU X15, 240(DI)
6473
SUBQ $256, BX
6574
ADDQ $256, DI
6675
CMPQ BX, $256
@@ -141,40 +150,40 @@ _9through16:
141150
MOVQ AX, -8(DI)(BX*1)
142151
RET
143152
_17through32:
144-
MOVOU X0, (DI)
145-
MOVOU X0, -16(DI)(BX*1)
153+
MOVOU X15, (DI)
154+
MOVOU X15, -16(DI)(BX*1)
146155
RET
147156
_33through64:
148-
MOVOU X0, (DI)
149-
MOVOU X0, 16(DI)
150-
MOVOU X0, -32(DI)(BX*1)
151-
MOVOU X0, -16(DI)(BX*1)
157+
MOVOU X15, (DI)
158+
MOVOU X15, 16(DI)
159+
MOVOU X15, -32(DI)(BX*1)
160+
MOVOU X15, -16(DI)(BX*1)
152161
RET
153162
_65through128:
154-
MOVOU X0, (DI)
155-
MOVOU X0, 16(DI)
156-
MOVOU X0, 32(DI)
157-
MOVOU X0, 48(DI)
158-
MOVOU X0, -64(DI)(BX*1)
159-
MOVOU X0, -48(DI)(BX*1)
160-
MOVOU X0, -32(DI)(BX*1)
161-
MOVOU X0, -16(DI)(BX*1)
163+
MOVOU X15, (DI)
164+
MOVOU X15, 16(DI)
165+
MOVOU X15, 32(DI)
166+
MOVOU X15, 48(DI)
167+
MOVOU X15, -64(DI)(BX*1)
168+
MOVOU X15, -48(DI)(BX*1)
169+
MOVOU X15, -32(DI)(BX*1)
170+
MOVOU X15, -16(DI)(BX*1)
162171
RET
163172
_129through256:
164-
MOVOU X0, (DI)
165-
MOVOU X0, 16(DI)
166-
MOVOU X0, 32(DI)
167-
MOVOU X0, 48(DI)
168-
MOVOU X0, 64(DI)
169-
MOVOU X0, 80(DI)
170-
MOVOU X0, 96(DI)
171-
MOVOU X0, 112(DI)
172-
MOVOU X0, -128(DI)(BX*1)
173-
MOVOU X0, -112(DI)(BX*1)
174-
MOVOU X0, -96(DI)(BX*1)
175-
MOVOU X0, -80(DI)(BX*1)
176-
MOVOU X0, -64(DI)(BX*1)
177-
MOVOU X0, -48(DI)(BX*1)
178-
MOVOU X0, -32(DI)(BX*1)
179-
MOVOU X0, -16(DI)(BX*1)
173+
MOVOU X15, (DI)
174+
MOVOU X15, 16(DI)
175+
MOVOU X15, 32(DI)
176+
MOVOU X15, 48(DI)
177+
MOVOU X15, 64(DI)
178+
MOVOU X15, 80(DI)
179+
MOVOU X15, 96(DI)
180+
MOVOU X15, 112(DI)
181+
MOVOU X15, -128(DI)(BX*1)
182+
MOVOU X15, -112(DI)(BX*1)
183+
MOVOU X15, -96(DI)(BX*1)
184+
MOVOU X15, -80(DI)(BX*1)
185+
MOVOU X15, -64(DI)(BX*1)
186+
MOVOU X15, -48(DI)(BX*1)
187+
MOVOU X15, -32(DI)(BX*1)
188+
MOVOU X15, -16(DI)(BX*1)
180189
RET

src/runtime/memmove_amd64.s

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,20 @@
3131
// See memmove Go doc for important implementation constraints.
3232

3333
// func memmove(to, from unsafe.Pointer, n uintptr)
34-
TEXT runtime·memmove(SB), NOSPLIT, $0-24
35-
34+
// ABIInternal for performance.
35+
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
36+
#ifdef GOEXPERIMENT_regabiargs
37+
// AX = to
38+
// BX = from
39+
// CX = n
40+
MOVQ AX, DI
41+
MOVQ BX, SI
42+
MOVQ CX, BX
43+
#else
3644
MOVQ to+0(FP), DI
3745
MOVQ from+8(FP), SI
3846
MOVQ n+16(FP), BX
47+
#endif
3948

4049
// REP instructions have a high startup cost, so we handle small sizes
4150
// with some straightline code. The REP MOVSQ instruction is really fast
@@ -244,6 +253,10 @@ move_129through256:
244253
MOVOU X13, -48(DI)(BX*1)
245254
MOVOU X14, -32(DI)(BX*1)
246255
MOVOU X15, -16(DI)(BX*1)
256+
#ifdef GOEXPERIMENT_regabig
257+
// X15 must be zero on return
258+
PXOR X15, X15
259+
#endif
247260
RET
248261
move_256through2048:
249262
SUBQ $256, BX
@@ -283,6 +296,10 @@ move_256through2048:
283296
LEAQ 256(SI), SI
284297
LEAQ 256(DI), DI
285298
JGE move_256through2048
299+
#ifdef GOEXPERIMENT_regabig
300+
// X15 must be zero on return
301+
PXOR X15, X15
302+
#endif
286303
JMP tail
287304

288305
avxUnaligned:

0 commit comments

Comments
 (0)