Skip to content

Iterating with ForEach over ImmutableArray is slower than over Array #780

@hnrqbaggio

Description

@hnrqbaggio

This is a spin off issue from https://github.com/dotnet/corefx/issues/36416, to scope down to just the ImmutableArray case.

As mentioned in other issues, the immutability sometimes comes with trade-offs in some operations, but in this case it seems that the extra overhead can be optimized, at least for when it's a collection of value type.

Comparing with Array

Looking at the ASM code generated, the Array version is high inlined, while for the ImmutableArray the JIT is able to inline the loop itself, but not the call to ImmutableArray<T>.GetEnumerator(). The method itself is quite simple, but calls to an internal method called ThrowNullRefIfNotInitialized() to validate that the underlying array is not null.

It seems that the extra method causes the collection to have more branch mis-predictions and cache misses than the array case (the cache misses show up in the results when the collection is larger, say 2048 instead of the default 512 elements).

Possible fix

If we change the implementation of GetEnumerator to use MethodImplOptions.AggressiveInlining, the JIT is able to inline the call and the stats of both benchmarks match and the Median for the Int32 case improves by 4x.

No Slower results for the provided threshold = 3% and noise filter = 0.3ns.
Faster base/diff Base Median (ns) Diff Median (ns) Modality
IterateForEach.ImmutableArray(Size: 512) 4.07 1110.44 273.14 several?

Unfortunately, this seems to not be enough for the Reference Type case. When the benchmark runs using String instead of Int32 the results are still the same as before, so it seems that marking the methods as inline is still not sufficient for the JIT to optimize them.

Correctness

The code in ThrowNullRefIfNotInitialized is just accessing the underlying Array.Length property and relying on that to throw if the object is null.

In the optimized version, I can't see the exact same instructions, so would still need to confirm that the optimizer is not discarding that check. However, the tests that validate that GetEnumerator will throw NRE in that condition did pass.

I have the changes above in my fork of the repo, but this is my first contribution so would like to check the feedback on the findings and not send a PR right away. 😁

Baseline

BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
  [Host]     : .NET Framework 4.8 (4.8.4075.0), X64 RyuJIT
  Job-RUPWLA : .NET Core 5.0.0 (CoreCLR 5.0.19.61001, CoreFX 5.0.19.61001), X64 RyuJIT

PowerPlanMode=00000000-0000-0000-0000-000000000000  Toolchain=CoreRun  IterationTime=250.0000 ms  
MaxIterationCount=20  MinIterationCount=15  WarmupCount=1  
Type Method Size Mean Error StdDev Median Min Max Gen 0 Gen 1 Gen 2 Allocated BranchInstructions/Op CacheMisses/Op BranchMispredictions/Op
IterateForEach<Int32> Array 512 173.4 ns 18.47 ns 21.28 ns 177.2 ns 145.1 ns 199.3 ns - - - - 212 0 1
IterateForEach<String> Array 512 157.5 ns 15.91 ns 18.32 ns 169.4 ns 131.8 ns 179.9 ns - - - - 205 0 1
IterateForEach<Int32> ImmutableArray 512 1,083.9 ns 38.67 ns 44.53 ns 1,110.4 ns 997.1 ns 1,115.4 ns - - - - 415 0 2
IterateForEach<String> ImmutableArray 512 1,101.5 ns 38.80 ns 44.69 ns 1,115.3 ns 1,010.2 ns 1,175.2 ns - - - - 462 0 2

Array

; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].Array()
       xor     eax,eax
       mov     rdx,qword ptr [rcx+8]
       xor     ecx,ecx
       mov     r8d,dword ptr [rdx+8]
       test    r8d,r8d
       jle     M00_L01
M00_L00:
       movsxd  rax,ecx
       mov     eax,dword ptr [rdx+rax*4+10h]
       inc     ecx
       cmp     r8d,ecx
       jg      M00_L00
M00_L01:
       ret
       sbb     dword ptr [rax],eax
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax-76h],dh
       rcr     dword ptr [rdx-6],cl
       jg      M00_L02
M00_L02:
       add     byte ptr [rbp+48h],dl
       mov     ebp,esp
       mov     qword ptr [rbp+10h],rcx
       mov     rax,qword ptr [rbp+10h]
       mov     rax,qword ptr [rax+58h]
; Total bytes of code 64

ImmutableArray

; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].ImmutableArray()
       push    rsi
       sub     rsp,40h
       xor     eax,eax
       mov     qword ptr [rsp+38h],rax
       mov     qword ptr [rsp+28h],rax
       mov     qword ptr [rsp+30h],rax
       xor     esi,esi
       mov     rcx,qword ptr [rcx+0C0h]
       mov     qword ptr [rsp+38h],rcx
       lea     rcx,[rsp+38h]
       lea     rdx,[rsp+28h]
       call    System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].GetEnumerator()
       jmp     M00_L01
M00_L00:
       cmp     dword ptr [rsp+30h],edx
       jae     M00_L02
       mov     rax,qword ptr [rsp+28h]
       mov     edx,dword ptr [rsp+30h]
       movsxd  rdx,edx
       mov     esi,dword ptr [rax+rdx*4+10h]
M00_L01:
       mov     eax,dword ptr [rsp+30h]
       inc     eax
       mov     dword ptr [rsp+30h],eax
       mov     rdx,qword ptr [rsp+28h]
       mov     edx,dword ptr [rdx+8]
       cmp     edx,eax
       jg      M00_L00
       mov     eax,esi
       add     rsp,40h
       pop     rsi
       ret
M00_L02:
       call    CoreCLR!JIT_RngChkFail
       int     3
       add     byte ptr [rcx],bl
       add     eax,72050002h
       add     dword ptr [rax+40h],esp
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax-26h],ah
; Total bytes of code 138
; System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].GetEnumerator()
       push    rbp
       push    rdi
       push    rsi
       sub     rsp,40h
       vzeroupper
       lea     rbp,[rsp+50h]
       xor     eax,eax
       mov     qword ptr [rbp-18h],rax
       mov     qword ptr [rbp-28h],rax
       mov     qword ptr [rbp+10h],rcx
       mov     qword ptr [rbp+18h],rdx
       mov     rcx,qword ptr [rbp+10h]
       mov     rcx,qword ptr [rcx]
       mov     qword ptr [rbp-18h],rcx
       lea     rcx,[rbp-18h]
       call    System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].ThrowNullRefIfNotInitialized()
       vxorps  xmm0,xmm0,xmm0
       vmovdqu xmmword ptr [rbp-28h],xmm0
       lea     rcx,[rbp-28h]
       mov     rdx,qword ptr [rbp-18h]
       call    System.Collections.Immutable.ImmutableArray`1+Enumerator[[System.Int32, System.Private.CoreLib]]..ctor(Int32[])
       mov     rdi,qword ptr [rbp+18h]
       lea     rsi,[rbp-28h]
       call    CoreCLR!JIT_ByRefWriteBarrier
       movs    qword ptr [rdi],qword ptr [rsi]
       mov     rax,qword ptr [rbp+18h]
       lea     rsp,[rbp-10h]
       pop     rsi
       pop     rdi
       pop     rbp
       ret
       int     3
       int     3
       sbb     dword ptr [rdi],eax
       add     al,0
       ???
       jb      00007ffa`5ad52c12
       ???
       add     dh,byte ptr [rax+1]
       push    rax
       add     byte ptr [rax],al
       add     al,cl
       fcmovbe st,st(4)
       pop     rdx
       cli
       jg      00007ffa`5ad52c1f
; Total bytes of code 127
; System.Collections.Immutable.ImmutableArray`1[[System.Int32, System.Private.CoreLib]].ThrowNullRefIfNotInitialized()
       push    rbp
       mov     rbp,rsp
       mov     qword ptr [rbp+10h],rcx
       mov     rax,qword ptr [rbp+10h]
       mov     rax,qword ptr [rax]
       mov     eax,dword ptr [rax+8]
       pop     rbp
       ret
       sbb     dword ptr [rcx],eax
       add     dword ptr [rax],eax
       add     dword ptr [rax],edx
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       and     bl,bl
       ???
       pop     rdx
       cli
       jg      M02_L00
M02_L00:
       add     byte ptr [rbp+48h],dl
; Total bytes of code 50

With Aggressive Inlining

BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
  [Host]     : .NET Framework 4.8 (4.8.4075.0), X64 RyuJIT
  Job-RUPWLA : .NET Core 5.0.0 (CoreCLR 5.0.19.61001, CoreFX 5.0.19.61001), X64 RyuJIT

PowerPlanMode=00000000-0000-0000-0000-000000000000  Toolchain=CoreRun  IterationTime=250.0000 ms  
MaxIterationCount=20  MinIterationCount=15  WarmupCount=1  
Type Method Size Mean Error StdDev Median Min Max Gen 0 Gen 1 Gen 2 Allocated BranchInstructions/Op CacheMisses/Op BranchMispredictions/Op
IterateForEach<Int32> Array 512 156.5 ns 6.94 ns 7.71 ns 154.1 ns 148.4 ns 173.3 ns - - - - 223 0 1
IterateForEach<String> Array 512 140.1 ns 4.23 ns 4.87 ns 138.7 ns 131.3 ns 148.8 ns - - - - 226 0 1
IterateForEach<Int32> ImmutableArray 512 271.6 ns 24.17 ns 27.84 ns 273.1 ns 233.7 ns 301.4 ns - - - - 424 0 2
IterateForEach<String> ImmutableArray 512 1,125.9 ns 107.31 ns 123.58 ns 1,071.1 ns 997.4 ns 1,293.6 ns - - - - 434 0 2

Array

; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].Array()
       xor     eax,eax
       mov     rdx,qword ptr [rcx+8]
       xor     ecx,ecx
       mov     r8d,dword ptr [rdx+8]
       test    r8d,r8d
       jle     M00_L01
M00_L00:
       movsxd  rax,ecx
       mov     eax,dword ptr [rdx+rax*4+10h]
       inc     ecx
       cmp     r8d,ecx
       jg      M00_L00
M00_L01:
       ret
       sbb     dword ptr [rax],eax
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax-76h],dh
       fistp   word ptr [rcx-6]
       jg      M00_L02
M00_L02:
       add     byte ptr [rbp+48h],dl
       mov     ebp,esp
       mov     qword ptr [rbp+10h],rcx
       mov     rax,qword ptr [rbp+10h]
       mov     rax,qword ptr [rax+58h]
; Total bytes of code 64

ImmutableArray

; System.Collections.IterateForEach`1[[System.Int32, System.Private.CoreLib]].ImmutableArray()
       sub     rsp,28h
       xor     eax,eax
       mov     rdx,qword ptr [rcx+0C0h]
       mov     ecx,dword ptr [rdx+8]
       mov     r8d,0FFFFFFFFh
       jmp     M00_L01
M00_L00:
       cmp     r8d,ecx
       jae     M00_L02
       movsxd  rax,r8d
       mov     eax,dword ptr [rdx+rax*4+10h]
M00_L01:
       inc     r8d
       cmp     ecx,r8d
       jg      M00_L00
       add     rsp,28h
       ret
M00_L02:
       call    CoreCLR!JIT_RngChkFail
       int     3
       add     byte ptr [rcx],bl
       add     al,1
       add     byte ptr [rdx+rax*2],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax],al
       add     byte ptr [rax-26h],dl
       loopne  00007ffa`59e12bb5
       cli
       jg      M00_L03
M00_L03:
       add     byte ptr [rbp+48h],dl
; Total bytes of code 82

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions