From aacc86ad8f2632d730048b153305a917f9864dbe Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 4 Jan 2022 10:55:26 -0500 Subject: [PATCH 1/2] jl_egal Optimization: Defer recursion into pointees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Immutable struct comparisons with `===` can be arbitrarily expensive for deeply recursive but (almost) equal objects. Whenever possible, it's valuable to defer the potentially expensive recursion by first comparing the struct fields for bitwise equality. Before this commit, two structs are compare elementwise, in the order of the struct definition, recursing when pointer fields are encountered. This commit defers the recursion into pointed-to fields until after all other non-pointer fields of the struct are compared. This has two advantages: 1. It defers the expensive part of === comparison as long as possible, in the hopes that we can exit early from dissimilarities discovered elsewhere in the struct instances. 2. It improves cache-locality by scanning the whole struct before jumping into any new places in memory (and reducing comparisons needed on the current cache line after returning from the recursive call). The drawback is that you'll have to scan the pointer fields again, which means potentially more cache misses if the struct is very large. The best way to tell if this is helpful or harmful is benchmarking. Here is the motivating benchmark, which indeed improves by 10x with this commit, compared to master: ```julia julia> using BenchmarkTools julia> struct VN val::Float32 next::Union{VN, Array} # put a mutable thing at the end, to prevent runtime from sharing instances end julia> struct NV next::Union{NV, Array} # put a mutable thing at the end, to prevent runtime from sharing instances val::Float32 end julia> function make_chain_vn(n, sentinal) head = VN(1, sentinal) for i in 2:n head = VN(rand(Int), head) end return head end make_chain_vn (generic function with 1 method) julia> function make_chain_nv(n, sentinal) head = NV(sentinal, 1) for i in 2:n head = NV(head, rand(Int)) end return head end make_chain_nv (generic function with 1 method) julia> vn1, vn2 = make_chain_vn(10000, []), make_chain_vn(10000, []); julia> nv1, nv2 = make_chain_nv(10000, []), make_chain_nv(10000, []); ``` Master: ``` julia> @btime $vn1 === $vn2 7.562 ns (0 allocations: 0 bytes) false julia> @btime $nv1 === $nv2 # slower, since it recurses pointers unnecessarily 76.952 μs (0 allocations: 0 bytes) false ``` After this commit: ``` julia> @btime $vn1 === $vn2 8.597 ns (0 allocations: 0 bytes) false julia> @btime $nv1 === $nv2 # We get to skip the recursion and exit early. :) 10.280 ns (0 allocations: 0 bytes) false ``` However, I think that there are probably other benchmarks where it harms performance, so we'll have to see... For example, here's one: In the exact opposite case as above, if the two objects _are_ (almost) equal, necessitating checking every object, the `NV` comparisons could have exited after all the recursive pointer checks, and never compare the fields, whereas now the fields are checked first, so this gets slower. I'm not exactly sure why the `VN` comparisons get somewhat slower too, but it's maybe because of the second scan mentioned above. ```julia julia> function make_chain_nv(n, sentinal) head = NV(sentinal, 1) for i in 2:n head = NV(head, i) end return head end make_chain_nv (generic function with 1 method) julia> function make_chain_vn(n, sentinal) head = VN(1, sentinal) for i in 2:n head = VN(i, head) end return head end make_chain_vn (generic function with 1 method) julia> vn1, vn2 = make_chain_vn(10000, []), make_chain_vn(10000, []); julia> nv1, nv2 = make_chain_nv(10000, []), make_chain_nv(10000, []); ``` Master: ``` julia> @btime $vn1 === $vn2 95.996 μs (0 allocations: 0 bytes) false julia> @btime $nv1 === $nv2 82.192 μs (0 allocations: 0 bytes) false ``` This commit: ``` julia> @btime $vn1 === $vn2 127.512 μs (0 allocations: 0 bytes) false julia> @btime $nv1 === $nv2 126.837 μs (0 allocations: 0 bytes) ``` --- src/builtins.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/builtins.c b/src/builtins.c index b5368ad36a164..a4b9724782f04 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -78,20 +78,20 @@ static int NOINLINE compare_svec(jl_svec_t *a, jl_svec_t *b) JL_NOTSAFEPOINT // See comment above for an explanation of NOINLINE. static int NOINLINE compare_fields(const jl_value_t *a, const jl_value_t *b, jl_datatype_t *dt) JL_NOTSAFEPOINT { - size_t f, nf = jl_datatype_nfields(dt); - for (f = 0; f < nf; f++) { + size_t nf = jl_datatype_nfields(dt); + // npointers is used at end, but fetched here for locality with nfields. + int npointers = ((jl_datatype_t*)dt)->layout->npointers; + for (size_t f = 0; f < nf; f++) { size_t offs = jl_field_offset(dt, f); char *ao = (char*)a + offs; char *bo = (char*)b + offs; if (jl_field_isptr(dt, f)) { jl_value_t *af = *(jl_value_t**)ao; jl_value_t *bf = *(jl_value_t**)bo; - if (af != bf) { - if (af == NULL || bf == NULL) - return 0; - if (!jl_egal(af, bf)) - return 0; - } + if (af != bf && (af == NULL || bf == NULL)) + return 0; + // Save ptr recursion until the end -- only recurse if otherwise equal + continue; } else { jl_datatype_t *ft = (jl_datatype_t*)jl_field_type_concrete(dt, f); @@ -123,6 +123,18 @@ static int NOINLINE compare_fields(const jl_value_t *a, const jl_value_t *b, jl_ } } } + // If we've gotten here, the objects are bitwise equal, besides their pointer fields. + // Now, we will recurse into jl_egal for the pointed-to elements, which might be + // arbitrarily expensive. + for (size_t p = 0; p < npointers; p++) { + size_t offs = jl_ptr_offset(dt, p); + jl_value_t *af = ((jl_value_t**)a)[offs]; + jl_value_t *bf = ((jl_value_t**)b)[offs]; + if (af != bf) { + if (!jl_egal(af, bf)) + return 0; + } + } return 1; } From ba718e895a8492ce14746906405c13c815a2e8e8 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Mon, 10 Jan 2022 12:13:34 -0500 Subject: [PATCH 2/2] Ignore pointer fields completely in the first pass of === Delay the nullptr checks until all non-ptr fields have been compared, since we have to go back to those anyways to follow the pointers. Co-authored-by:Jeff Bezanson --- src/builtins.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/builtins.c b/src/builtins.c index a4b9724782f04..7a4c67c0eff31 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -86,11 +86,9 @@ static int NOINLINE compare_fields(const jl_value_t *a, const jl_value_t *b, jl_ char *ao = (char*)a + offs; char *bo = (char*)b + offs; if (jl_field_isptr(dt, f)) { - jl_value_t *af = *(jl_value_t**)ao; - jl_value_t *bf = *(jl_value_t**)bo; - if (af != bf && (af == NULL || bf == NULL)) - return 0; // Save ptr recursion until the end -- only recurse if otherwise equal + // Note that we also skip comparing the pointers for null here, because + // null fields are rare so it can save CPU to delay this read too. continue; } else { @@ -131,6 +129,8 @@ static int NOINLINE compare_fields(const jl_value_t *a, const jl_value_t *b, jl_ jl_value_t *af = ((jl_value_t**)a)[offs]; jl_value_t *bf = ((jl_value_t**)b)[offs]; if (af != bf) { + if (af == NULL || bf == NULL) + return 0; if (!jl_egal(af, bf)) return 0; }