
Commit 601ea46

cocotyty authored and gopherbot committed
runtime: add ERMS-based memmove support for modern CPU platforms
The current memmove implementation uses REP MOVSB to copy data larger
than 2KB when the useAVXmemmove global variable is false and the CPU
supports the ERMS feature. This feature is currently only enabled on
CPUs in the Sandy Bridge (Client), Sandy Bridge (Server), Ivy Bridge
(Client), and Ivy Bridge (Server) microarchitectures.

For modern Intel CPU microarchitectures that support the ERMS feature,
such as Ice Lake (Server) and Sapphire Rapids, REP MOVSB achieves
better performance than the AVX-based copy currently implemented in
memmove.

Benchstat result:

goos: linux
goarch: amd64
pkg: runtime
cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz
                │  ./old.txt  │              ./new.txt              │
                │   sec/op    │   sec/op     vs base                │
Memmove/2048-2    25.24n ± 0%   24.27n ± 0%   -3.84% (p=0.000 n=10)
Memmove/4096-2    44.87n ± 0%   33.16n ± 1%  -26.11% (p=0.000 n=10)
geomean           33.65n        28.37n       -15.71%

                │  ./old.txt   │              ./new.txt               │
                │     B/s      │      B/s      vs base                │
Memmove/2048-2    75.56Gi ± 0%   78.59Gi ± 0%   +4.02% (p=0.000 n=10)
Memmove/4096-2    85.01Gi ± 0%  115.05Gi ± 1%  +35.34% (p=0.000 n=10)
geomean           80.14Gi        95.09Gi       +18.65%

Fixes #66958

Change-Id: I1fafd1b51a16752f83ac15047cf3b29422a79d5d
GitHub-Last-Rev: 89cf5af
GitHub-Pull-Request: #66959
Reviewed-on: https://go-review.googlesource.com/c/go/+/580735
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
1 parent 20e18c9 commit 601ea46
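The quoted numbers come from the runtime package's own Memmove benchmark. A self-contained sketch that exercises the same code path for the two affected sizes could look like this (the built-in copy lowers to runtime.memmove); the file, package, and function names here are illustrative, not part of the change:

	// memmove_bench_test.go (illustrative). Run with something like
	// `go test -bench Copy -count 10` before and after the change,
	// then compare the outputs with benchstat as in the tables above.
	package memmovebench

	import "testing"

	func benchmarkCopy(b *testing.B, n int) {
		src := make([]byte, n)
		dst := make([]byte, n)
		b.SetBytes(int64(n)) // lets the harness report B/s, as in the second table
		for i := 0; i < b.N; i++ {
			copy(dst, src) // the built-in copy compiles down to runtime.memmove
		}
	}

	func BenchmarkCopy2048(b *testing.B) { benchmarkCopy(b, 2048) }
	func BenchmarkCopy4096(b *testing.B) { benchmarkCopy(b, 4096) }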

File tree

4 files changed: +59 -25 lines changed

src/internal/cpu/cpu.go

+1
@@ -37,6 +37,7 @@ var X86 struct {
 	HasBMI1      bool
 	HasBMI2      bool
 	HasERMS      bool
+	HasFSRM      bool
 	HasFMA       bool
 	HasOSXSAVE   bool
 	HasPCLMULQDQ bool

src/internal/cpu/cpu_x86.go

+6 -2
@@ -40,7 +40,8 @@ const (
 	cpuid_SHA      = 1 << 29
 	cpuid_AVX512BW = 1 << 30
 	cpuid_AVX512VL = 1 << 31
-
+	// edx bits
+	cpuid_FSRM = 1 << 4
 	// edx bits for CPUID 0x80000001
 	cpuid_RDTSCP = 1 << 27
 )
@@ -52,6 +53,7 @@ func doinit() {
 		{Name: "adx", Feature: &X86.HasADX},
 		{Name: "aes", Feature: &X86.HasAES},
 		{Name: "erms", Feature: &X86.HasERMS},
+		{Name: "fsrm", Feature: &X86.HasFSRM},
 		{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
 		{Name: "rdtscp", Feature: &X86.HasRDTSCP},
 		{Name: "sha", Feature: &X86.HasSHA},
@@ -137,7 +139,7 @@ func doinit() {
 		return
 	}

-	_, ebx7, _, _ := cpuid(7, 0)
+	_, ebx7, _, edx7 := cpuid(7, 0)
 	X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
 	X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
 	X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +153,8 @@ func doinit() {
 		X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
 	}

+	X86.HasFSRM = isSet(edx7, cpuid_FSRM)
+
 	var maxExtendedInformation uint32
 	maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0)
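FSRM ("Fast Short REP MOV") is reported in CPUID leaf 7, sub-leaf 0, EDX bit 4, which is what the new cpuid_FSRM constant encodes. A minimal sketch of the same bit test outside the runtime, with the edx7 value hard-coded since pure Go cannot issue CPUID without assembly:

	package main

	import "fmt"

	// Mirrors the constant added above: EDX bit 4 of CPUID leaf 7, sub-leaf 0.
	const cpuid_FSRM = 1 << 4

	func isSet(hwc, value uint32) bool { return hwc&value != 0 }

	func main() {
		// In internal/cpu this value comes from cpuid(7, 0); it is
		// hard-coded here because pure Go cannot execute CPUID directly.
		edx7 := uint32(0b10000)
		fmt.Println("FSRM supported:", isSet(edx7, cpuid_FSRM))
	}

Because the feature is registered in the options table under the name "fsrm", it should also be possible to disable it for testing with GODEBUG=cpu.fsrm=off, as with the other entries in that list.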

src/runtime/cpuflags_amd64.go

+24 -10
@@ -8,17 +8,31 @@ import (
 	"internal/cpu"
 )

-var useAVXmemmove bool
+var memmoveBits uint8

-func init() {
-	// Let's remove stepping and reserved fields
-	processor := processorVersionInfo & 0x0FFF3FF0
+const (
+	// avxSupported indicates that the CPU supports AVX instructions.
+	avxSupported = 1 << 0

-	isIntelBridgeFamily := isIntel &&
-		processor == 0x206A0 ||
-		processor == 0x206D0 ||
-		processor == 0x306A0 ||
-		processor == 0x306E0
+	// repmovsPreferred indicates that REP MOVSx instruction is more
+	// efficient on the CPU.
+	repmovsPreferred = 1 << 1
+)

-	useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily
+func init() {
+	// Here we assume that on modern CPUs with both FSRM and ERMS features,
+	// copying data blocks of 2KB or larger using the REP MOVSB instruction
+	// will be more efficient to avoid having to keep up with CPU generations.
+	// Therefore, we may retain a BlockList mechanism to ensure that microarchitectures
+	// that do not fit this case may appear in the future.
+	// We enable it on Intel CPUs first, and we may support more platforms
+	// in the future.
+	isERMSNiceCPU := isIntel
+	useREPMOV := isERMSNiceCPU && cpu.X86.HasERMS && cpu.X86.HasFSRM
+	if cpu.X86.HasAVX {
+		memmoveBits |= avxSupported
+	}
+	if useREPMOV {
+		memmoveBits |= repmovsPreferred
+	}
 }
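memmoveBits packs both decisions into a single byte so the assembly below can load it once into AX. The following Go function is a sketch of how the forward-path branch chain in memmove_amd64.s consumes those bits for copies larger than 256 bytes; it is an interpretation of the control flow, not code that exists in the tree:

	package main

	import "fmt"

	const (
		avxSupported     = 1 << 0 // mirrors the runtime constant above
		repmovsPreferred = 1 << 1 // mirrors the runtime constant above
	)

	// chooseForward models the branch chain after the "forward" label:
	// REP MOVSQ is only attempted for large copies to 16-byte-aligned
	// destinations on CPUs where it is preferred; otherwise AVX wins
	// whenever it is available.
	func chooseForward(bits uint8, n int, dstAddr uintptr) string {
		if n >= 2048 && dstAddr&15 == 0 && bits&repmovsPreferred != 0 {
			return "fwdBy8 (REP MOVSQ)"
		}
		if bits&avxSupported != 0 {
			return "avxUnaligned"
		}
		if n <= 2048 {
			return "move_256through2048"
		}
		return "aligned 8-byte MOVQ loop"
	}

	func main() {
		fmt.Println(chooseForward(avxSupported|repmovsPreferred, 4096, 0)) // fwdBy8 (REP MOVSQ)
		fmt.Println(chooseForward(avxSupported, 4096, 0))                  // avxUnaligned
	}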

src/runtime/memmove_amd64.s

+28 -13
@@ -72,26 +72,34 @@ tail:
 	CMPQ	BX, $256
 	JBE	move_129through256

-	TESTB	$1, runtime·useAVXmemmove(SB)
-	JNZ	avxUnaligned
-
+	MOVB	runtime·memmoveBits(SB), AX
+	// We have AVX but we don't want to use REP MOVSx.
+	CMPB	AX, $const_avxSupported
+	JEQ	avxUnaligned
 /*
  * check and set for backwards
  */
 	CMPQ	SI, DI
 	JLS	back

 /*
- * forward copy loop
- */
+	 * forward copy loop
+	 */
 forward:
 	CMPQ	BX, $2048
-	JLS	move_256through2048
-
-	// If REP MOVSB isn't fast, don't use it
-	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
-	JNE	fwdBy8
+	JL	check_avx
+	// REP MOVSx is slow if destination address is unaligned.
+	TESTQ	$15, DI
+	JNZ	check_avx
+	TESTB	$const_repmovsPreferred, AX
+	JNZ	fwdBy8
+	// For backward copy, REP MOVSx performs worse than avx.
+check_avx:
+	TESTB	$const_avxSupported, AX
+	JNZ	avxUnaligned

+	CMPQ	BX, $2048
+	JLS	move_256through2048
 	// Check alignment
 	MOVL	SI, AX
 	ORL	DI, AX
@@ -104,12 +112,16 @@ forward:
 	RET

 fwdBy8:
+	// Loading the last (possibly partially overlapping) word and writing
+	// it at the end.
+	MOVQ	-8(SI)(BX*1), AX
+	LEAQ	-8(DI)(BX*1), DX
 	// Do 8 bytes at a time
-	MOVQ	BX, CX
+	LEAQ	-1(BX), CX
 	SHRQ	$3, CX
-	ANDQ	$7, BX
 	REP;	MOVSQ
-	JMP	tail
+	MOVQ	AX, (DX)
+	RET

 back:
 /*
@@ -119,6 +131,9 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
+	TESTB	$const_avxSupported, AX
+	JNZ	avxUnaligned
 /*
  * whole thing backwards has
  * adjusted addresses
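The reworked fwdBy8 no longer jumps back to the generic tail handler: it loads the last (possibly partially overlapping) 8-byte word before the bulk copy, moves (length-1)/8 full words with REP MOVSQ, and then stores the saved word at the end so that final store covers whatever tail remains. A rough Go rendering of the same trick, assuming a forward-safe copy of n >= 8 bytes (hypothetical helper, not runtime code):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// fwdBy8 copies n >= 8 bytes from src to dst the way the assembly does:
	// (n-1)/8 full words, then one 8-byte store at the end that covers the
	// tail, using a word loaded before the bulk copy began.
	func fwdBy8(dst, src []byte, n int) {
		last := binary.LittleEndian.Uint64(src[n-8:]) // MOVQ -8(SI)(BX*1), AX
		words := (n - 1) / 8                          // LEAQ -1(BX), CX; SHRQ $3, CX
		for i := 0; i < words; i++ {                  // REP; MOVSQ
			binary.LittleEndian.PutUint64(dst[i*8:], binary.LittleEndian.Uint64(src[i*8:]))
		}
		binary.LittleEndian.PutUint64(dst[n-8:], last) // MOVQ AX, (DX)
	}

	func main() {
		src := []byte("the quick brown fox jumps over the lazy dog")
		dst := make([]byte, len(src))
		fwdBy8(dst, src, len(src))
		fmt.Println(string(dst))
	}

With the saved-word store the function can RET directly after REP MOVSQ, which is why the old ANDQ $7, BX and JMP tail disappear from the diff.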
