-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Description
This is a spin off issue from https://github.com/dotnet/corefx/issues/36416, to scope down to just the ImmutableArray case.
As mentioned in other issues, the immutability sometimes comes with trade-offs in some operations, but in this case it seems that the extra overhead can be optimized, at least for when it's a collection of value type.
Comparing with Array
Looking at the ASM code generated, the Array version is high inlined, while for the ImmutableArray the JIT is able to inline the loop itself, but not the call to ImmutableArray<T>.GetEnumerator()
. The method itself is quite simple, but calls to an internal method called ThrowNullRefIfNotInitialized()
to validate that the underlying array is not null.
It seems that the extra method causes the collection to have more branch mis-predictions and cache misses than the array case (the cache misses show up in the results when the collection is larger, say 2048 instead of the default 512 elements).
Possible fix
If we change the implementation of GetEnumerator to use MethodImplOptions.AggressiveInlining, the JIT is able to inline the call and the stats of both benchmarks match and the Median for the Int32 case improves by 4x.
No Slower results for the provided threshold = 3% and noise filter = 0.3ns.
Faster | base/diff | Base Median (ns) | Diff Median (ns) | Modality |
---|---|---|---|---|
IterateForEach.ImmutableArray(Size: 512) | 4.07 | 1110.44 | 273.14 | several? |
Unfortunately, this seems to not be enough for the Reference Type case. When the benchmark runs using String instead of Int32 the results are still the same as before, so it seems that marking the methods as inline is still not sufficient for the JIT to optimize them.
Correctness
The code in ThrowNullRefIfNotInitialized
is just accessing the underlying Array.Length
property and relying on that to throw if the object is null.
In the optimized version, I can't see the exact same instructions, so would still need to confirm that the optimizer is not discarding that check. However, the tests that validate that GetEnumerator will throw NRE in that condition did pass.
I have the changes above in my fork of the repo, but this is my first contribution so would like to check the feedback on the findings and not send a PR right away. 😁
Baseline
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
[Host] : .NET Framework 4.8 (4.8.4075.0), X64 RyuJIT
Job-RUPWLA : .NET Core 5.0.0 (CoreCLR 5.0.19.61001, CoreFX 5.0.19.61001), X64 RyuJIT
PowerPlanMode=00000000-0000-0000-0000-000000000000 Toolchain=CoreRun IterationTime=250.0000 ms
MaxIterationCount=20 MinIterationCount=15 WarmupCount=1
Type | Method | Size | Mean | Error | StdDev | Median | Min | Max | Gen 0 | Gen 1 | Gen 2 | Allocated | BranchInstructions/Op | CacheMisses/Op | BranchMispredictions/Op |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
IterateForEach<Int32> | Array | 512 | 173.4 ns | 18.47 ns | 21.28 ns | 177.2 ns | 145.1 ns | 199.3 ns | - | - | - | - | 212 | 0 | 1 |
IterateForEach<String> | Array | 512 | 157.5 ns | 15.91 ns | 18.32 ns | 169.4 ns | 131.8 ns | 179.9 ns | - | - | - | - | 205 | 0 | 1 |
IterateForEach<Int32> | ImmutableArray | 512 | 1,083.9 ns | 38.67 ns | 44.53 ns | 1,110.4 ns | 997.1 ns | 1,115.4 ns | - | - | - | - | 415 | 0 | 2 |
IterateForEach<String> | ImmutableArray | 512 | 1,101.5 ns | 38.80 ns | 44.69 ns | 1,115.3 ns | 1,010.2 ns | 1,175.2 ns | - | - | - | - | 462 | 0 | 2 |
Array
; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].Array()
xor eax,eax
mov rdx,qword ptr [rcx+8]
xor ecx,ecx
mov r8d,dword ptr [rdx+8]
test r8d,r8d
jle M00_L01
M00_L00:
movsxd rax,ecx
mov eax,dword ptr [rdx+rax*4+10h]
inc ecx
cmp r8d,ecx
jg M00_L00
M00_L01:
ret
sbb dword ptr [rax],eax
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax-76h],dh
rcr dword ptr [rdx-6],cl
jg M00_L02
M00_L02:
add byte ptr [rbp+48h],dl
mov ebp,esp
mov qword ptr [rbp+10h],rcx
mov rax,qword ptr [rbp+10h]
mov rax,qword ptr [rax+58h]
; Total bytes of code 64
ImmutableArray
; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].ImmutableArray()
push rsi
sub rsp,40h
xor eax,eax
mov qword ptr [rsp+38h],rax
mov qword ptr [rsp+28h],rax
mov qword ptr [rsp+30h],rax
xor esi,esi
mov rcx,qword ptr [rcx+0C0h]
mov qword ptr [rsp+38h],rcx
lea rcx,[rsp+38h]
lea rdx,[rsp+28h]
call System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].GetEnumerator()
jmp M00_L01
M00_L00:
cmp dword ptr [rsp+30h],edx
jae M00_L02
mov rax,qword ptr [rsp+28h]
mov edx,dword ptr [rsp+30h]
movsxd rdx,edx
mov esi,dword ptr [rax+rdx*4+10h]
M00_L01:
mov eax,dword ptr [rsp+30h]
inc eax
mov dword ptr [rsp+30h],eax
mov rdx,qword ptr [rsp+28h]
mov edx,dword ptr [rdx+8]
cmp edx,eax
jg M00_L00
mov eax,esi
add rsp,40h
pop rsi
ret
M00_L02:
call CoreCLR!JIT_RngChkFail
int 3
add byte ptr [rcx],bl
add eax,72050002h
add dword ptr [rax+40h],esp
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax-26h],ah
; Total bytes of code 138
; System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].GetEnumerator()
push rbp
push rdi
push rsi
sub rsp,40h
vzeroupper
lea rbp,[rsp+50h]
xor eax,eax
mov qword ptr [rbp-18h],rax
mov qword ptr [rbp-28h],rax
mov qword ptr [rbp+10h],rcx
mov qword ptr [rbp+18h],rdx
mov rcx,qword ptr [rbp+10h]
mov rcx,qword ptr [rcx]
mov qword ptr [rbp-18h],rcx
lea rcx,[rbp-18h]
call System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].ThrowNullRefIfNotInitialized()
vxorps xmm0,xmm0,xmm0
vmovdqu xmmword ptr [rbp-28h],xmm0
lea rcx,[rbp-28h]
mov rdx,qword ptr [rbp-18h]
call System.Collections.Immutable.ImmutableArray`1+Enumerator[[System.Int32, System.Private.CoreLib]]..ctor(Int32[])
mov rdi,qword ptr [rbp+18h]
lea rsi,[rbp-28h]
call CoreCLR!JIT_ByRefWriteBarrier
movs qword ptr [rdi],qword ptr [rsi]
mov rax,qword ptr [rbp+18h]
lea rsp,[rbp-10h]
pop rsi
pop rdi
pop rbp
ret
int 3
int 3
sbb dword ptr [rdi],eax
add al,0
???
jb 00007ffa`5ad52c12
???
add dh,byte ptr [rax+1]
push rax
add byte ptr [rax],al
add al,cl
fcmovbe st,st(4)
pop rdx
cli
jg 00007ffa`5ad52c1f
; Total bytes of code 127
; System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].ThrowNullRefIfNotInitialized()
push rbp
mov rbp,rsp
mov qword ptr [rbp+10h],rcx
mov rax,qword ptr [rbp+10h]
mov rax,qword ptr [rax]
mov eax,dword ptr [rax+8]
pop rbp
ret
sbb dword ptr [rcx],eax
add dword ptr [rax],eax
add dword ptr [rax],edx
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
and bl,bl
???
pop rdx
cli
jg M02_L00
M02_L00:
add byte ptr [rbp+48h],dl
; Total bytes of code 50
With Aggressive Inlining
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
[Host] : .NET Framework 4.8 (4.8.4075.0), X64 RyuJIT
Job-RUPWLA : .NET Core 5.0.0 (CoreCLR 5.0.19.61001, CoreFX 5.0.19.61001), X64 RyuJIT
PowerPlanMode=00000000-0000-0000-0000-000000000000 Toolchain=CoreRun IterationTime=250.0000 ms
MaxIterationCount=20 MinIterationCount=15 WarmupCount=1
Type | Method | Size | Mean | Error | StdDev | Median | Min | Max | Gen 0 | Gen 1 | Gen 2 | Allocated | BranchInstructions/Op | CacheMisses/Op | BranchMispredictions/Op |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
IterateForEach<Int32> | Array | 512 | 156.5 ns | 6.94 ns | 7.71 ns | 154.1 ns | 148.4 ns | 173.3 ns | - | - | - | - | 223 | 0 | 1 |
IterateForEach<String> | Array | 512 | 140.1 ns | 4.23 ns | 4.87 ns | 138.7 ns | 131.3 ns | 148.8 ns | - | - | - | - | 226 | 0 | 1 |
IterateForEach<Int32> | ImmutableArray | 512 | 271.6 ns | 24.17 ns | 27.84 ns | 273.1 ns | 233.7 ns | 301.4 ns | - | - | - | - | 424 | 0 | 2 |
IterateForEach<String> | ImmutableArray | 512 | 1,125.9 ns | 107.31 ns | 123.58 ns | 1,071.1 ns | 997.4 ns | 1,293.6 ns | - | - | - | - | 434 | 0 | 2 |
Array
; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].Array()
xor eax,eax
mov rdx,qword ptr [rcx+8]
xor ecx,ecx
mov r8d,dword ptr [rdx+8]
test r8d,r8d
jle M00_L01
M00_L00:
movsxd rax,ecx
mov eax,dword ptr [rdx+rax*4+10h]
inc ecx
cmp r8d,ecx
jg M00_L00
M00_L01:
ret
sbb dword ptr [rax],eax
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax-76h],dh
fistp word ptr [rcx-6]
jg M00_L02
M00_L02:
add byte ptr [rbp+48h],dl
mov ebp,esp
mov qword ptr [rbp+10h],rcx
mov rax,qword ptr [rbp+10h]
mov rax,qword ptr [rax+58h]
; Total bytes of code 64
ImmutableArray
; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].ImmutableArray()
sub rsp,28h
xor eax,eax
mov rdx,qword ptr [rcx+0C0h]
mov ecx,dword ptr [rdx+8]
mov r8d,0FFFFFFFFh
jmp M00_L01
M00_L00:
cmp r8d,ecx
jae M00_L02
movsxd rax,r8d
mov eax,dword ptr [rdx+rax*4+10h]
M00_L01:
inc r8d
cmp ecx,r8d
jg M00_L00
add rsp,28h
ret
M00_L02:
call CoreCLR!JIT_RngChkFail
int 3
add byte ptr [rcx],bl
add al,1
add byte ptr [rdx+rax*2],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax],al
add byte ptr [rax-26h],dl
loopne 00007ffa`59e12bb5
cli
jg M00_L03
M00_L03:
add byte ptr [rbp+48h],dl
; Total bytes of code 82