
stage2 array access performance (boundchecks related?) #12215

Closed
xxxbxxx opened this issue Jul 24, 2022 · 5 comments · Fixed by #13233
Labels
bug (Observed behavior contradicts documented or intended behavior)
frontend (Tokenization, parsing, AstGen, Sema, and Liveness.)

Comments

@xxxbxxx
Contributor

xxxbxxx commented Jul 24, 2022

Zig Version

0.10.0-dev.3286+9ec3f611c

Steps to Reproduce

const std = @import("std");

test {
    const array = try std.testing.allocator.create([10][10 * 10 * 10 * 10]u64);
    defer std.testing.allocator.destroy(array);
    array[1][1] = 0;

    var a: u32 = 1;
    var b: u32 = 1;

    var i: u32 = 0;
    while (i < 12345678) : (i += 1) {
        array[a][b] += array[a][b] * i;
    }

    try std.testing.expect(array[a][b] == 0);
}

Expected Behavior

test passes in <1s
(and does with stage 1)

Actual Behavior

test passes in > 20s (stage 2)

in Debug and ReleaseSafe modes.
seems to work fine in ReleaseFast (there's no memcpy in the LLVM IR).

xxxbxxx added the bug label Jul 24, 2022
@xxxbxxx
Contributor Author

xxxbxxx commented Jul 24, 2022

Most of the time is spent in memcpy.

looks like array[a][b] first loads array[a] into a temp copy and then indexes [b]. array[a] is huge!

portion of the AIR:

          %156 = ptr_elem_ptr(%61, %145!)
          %158 = load([10000]u64, %156!)  // <<<<<
          %159 = load(u32, %87)
          %160 = intcast(usize, %159!)
          %162 = cmp_lt(%160, %161)
          %167!= block(void, {
            %168!= cond_br(%162!, {
              %161!
              %169!= br(%167, @Zir.Inst.Ref.void_value)
            }, {
              %166!= call(%123, [%160, %161!])
            })
          })
          %170 = array_elem_val(%158!, %160!)
          %171 = load(u32, %93)
          %172 = intcast(u64, %171!)
          %174 = mul_with_overflow(%170!, %172!)
          %175 = struct_field_val(%174, 1)

portion of the LLVM IR, stage1:

BoundsCheckOk4:                                   ; preds = %BoundsCheckOk
  %32 = getelementptr inbounds [10000 x i64], [10000 x i64]* %28, i64 0, i64 %30, !dbg !3086
  %33 = load i64, i64* %32, align 8, !dbg !3086
  %34 = load i32, i32* %a, align 4, !dbg !3087
  %35 = zext i32 %34 to i64, !dbg !3087
  %36 = load [10 x [10000 x i64]]*, [10 x [10000 x i64]]** %array, align 8, !dbg !3088
  %37 = icmp ult i64 %35, 10, !dbg !3088
  br i1 %37, label %BoundsCheckOk6, label %BoundsCheckFail5, !dbg !3088

BoundsCheckFail5:                                 ; preds = %BoundsCheckOk4
  call fastcc void @std.builtin.default_panic(%"[]u8"* @86, %std.builtin.StackTrace* null), !dbg !3088
  unreachable, !dbg !3088

BoundsCheckOk6:                                   ; preds = %BoundsCheckOk4
  %38 = getelementptr inbounds [10 x [10000 x i64]], [10 x [10000 x i64]]* %36, i64 0, i64 %35, !dbg !3088
  %39 = load i32, i32* %b, align 4, !dbg !3089
  %40 = zext i32 %39 to i64, !dbg !3089
  %41 = icmp ult i64 %40, 10000, !dbg !3090
  br i1 %41, label %BoundsCheckOk8, label %BoundsCheckFail7, !dbg !3090

BoundsCheckFail7:                                 ; preds = %BoundsCheckOk6
  call fastcc void @std.builtin.default_panic(%"[]u8"* @86, %std.builtin.StackTrace* null), !dbg !3090
  unreachable, !dbg !3090

BoundsCheckOk8:                                   ; preds = %BoundsCheckOk6
  %42 = getelementptr inbounds [10000 x i64], [10000 x i64]* %38, i64 0, i64 %40, !dbg !3090
  %43 = load i64, i64* %42, align 8, !dbg !3090
  %44 = load i32, i32* %i, align 4, !dbg !3091
  %45 = zext i32 %44 to i64, !dbg !3091
  %46 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %43, i64 %45), !dbg !3092
  %47 = extractvalue { i64, i1 } %46, 0, !dbg !3092
  %48 = extractvalue { i64, i1 } %46, 1, !dbg !3092
  br i1 %48, label %OverflowFail, label %OverflowOk, !dbg !3092

stage2:

Block5:                                           ; preds = %Then3
  %31 = getelementptr inbounds [10000 x i64], [10000 x i64]* %27, i32 0, i64 %29, !dbg !4998
  %32 = load i64, i64* %31, align 8, !dbg !4998
  %33 = load i32, i32* %8, align 4, !dbg !4998
  %34 = zext i32 %33 to i64, !dbg !4998
  %35 = icmp ult i64 %34, 10, !dbg !4998
  br i1 %35, label %Then6, label %Else7, !dbg !4998

Then6:                                            ; preds = %Block5
  br label %Block8, !dbg !4998

Else7:                                            ; preds = %Block5
  call fastcc void @builtin.panicOutOfBounds(i64 %34, i64 10), !dbg !4998
  unreachable, !dbg !4998

Block8:                                           ; preds = %Then6
  %36 = getelementptr inbounds [10 x [10000 x i64]], [10 x [10000 x i64]]* %17, i32 0, i64 %34, !dbg !4998
  %37 = bitcast [10000 x i64]* %5 to i8*, !dbg !4998
  %38 = bitcast [10000 x i64]* %36 to i8*, !dbg !4998
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %37, i8* align 8 %38, i64 80000, i1 false), !dbg !4998
  %39 = load i32, i32* %7, align 4, !dbg !4998
  %40 = zext i32 %39 to i64, !dbg !4998
  %41 = icmp ult i64 %40, 10000, !dbg !4998
  br i1 %41, label %Then9, label %Else10, !dbg !4998

Then9:                                            ; preds = %Block8
  br label %Block11, !dbg !4998

Else10:                                           ; preds = %Block8
  call fastcc void @builtin.panicOutOfBounds(i64 %40, i64 10000), !dbg !4998
  unreachable, !dbg !4998

Block11:                                          ; preds = %Then9
  %42 = getelementptr inbounds [10000 x i64], [10000 x i64]* %5, i32 0, i64 %40, !dbg !4998
  %43 = load i64, i64* %42, align 8, !dbg !4998
  %44 = load i32, i32* %6, align 4, !dbg !4998
  %45 = zext i32 %44 to i64, !dbg !4998
  %46 = call fastcc { i64, i1 } @llvm.umul.with.overflow.i64(i64 %43, i64 %45), !dbg !4998
  %47 = extractvalue { i64, i1 } %46, 0, !dbg !4998
  %48 = extractvalue { i64, i1 } %46, 1, !dbg !4998
  %49 = getelementptr inbounds %2, %2* %4, i32 0, i32 0, !dbg !4998
  store i64 %47, i64* %49, align 8, !dbg !4998
  %50 = getelementptr inbounds %2, %2* %4, i32 0, i32 1, !dbg !4998
  store i1 %48, i1* %50, align 1, !dbg !4998
  %51 = getelementptr inbounds %2, %2* %4, i32 0, i32 1, !dbg !4998
  %52 = load i1, i1* %51, align 1, !dbg !4998
  %53 = icmp eq i1 %52, false, !dbg !4998
  br i1 %53, label %Then12, label %Else13, !dbg !4998

xxxbxxx changed the title from stage2 array access performance to stage2 array access performance (boundchecks related?) Jul 24, 2022
Vexu added the frontend label Jul 24, 2022
Vexu added this to the 0.10.0 milestone Jul 24, 2022
@IntegratedQuantum
Contributor

To everyone who is looking for a temporary solution:

array[a][b] → (&array[a])[b]
strct.array[i] → (&strct.array)[i]

This solves the performance issue because it seems to store a pointer to the array instead of copying the whole array:
`%5 = alloca [10000 x i64]` becomes `%5 = alloca ptr` in the LLVM IR.
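For reference, this is roughly what that workaround looks like applied to the reproduction at the top of this issue (just a sketch mirroring that test, not a change to it):

```zig
const std = @import("std");

test "workaround sketch" {
    const array = try std.testing.allocator.create([10][10 * 10 * 10 * 10]u64);
    defer std.testing.allocator.destroy(array);
    array[1][1] = 0;

    var a: u32 = 1;
    var b: u32 = 1;

    var i: u32 = 0;
    while (i < 12345678) : (i += 1) {
        // Index through a pointer to the inner row instead of array[a][b],
        // which copies the whole [10_000]u64 row into a temporary first.
        (&array[a])[b] += (&array[a])[b] * i;
    }

    try std.testing.expect(array[a][b] == 0);
}
```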

@topolarity
Contributor

topolarity commented Oct 2, 2022

Reduction:

test {
    var array: [10][10_000]u64 = undefined;
    var array_ptr = &array;
    var a: usize = 1;
    _ = array_ptr[a][a];
}

The LLVM IR generated for this by stage2 in -O ReleaseSafe is:

define internal fastcc i16 @test3.test_0() unnamed_addr #0 {
Entry:
  %0 = alloca [10000 x i64], align 8
  ; ... omitted for brevity

Block:                                            ; preds = %Then
  %7 = getelementptr inbounds [10 x [10000 x i64]], ptr %4, i32 0, i64 %5
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %0, ptr align 8 %7, i64 80000, i1 false)
  %8 = load i64, ptr %1, align 8
  %9 = icmp ult i64 %8, 10000
  br i1 %9, label %Then1, label %Else2

Then1:                                            ; preds = %Block
  br label %Block3

Else2:                                            ; preds = %Block
  call fastcc void @builtin.panicOutOfBounds(i64 %8, i64 10000)
  unreachable

Block3:                                           ; preds = %Then1
  ret i16 0
}

Notice the intermediate load/memcpy of the inner array, which is very slow. The copy indeed seems to be forced by the code inserted for the inbounds safety check.

In -O ReleaseFast, LLVM has no problem doing the optimization for us: the intermediate load is gone and the test passes in <1s, just like in stage1.

@topolarity
Contributor

The problem boils down to this optimization in src/codegen/llvm.zig (fn airLoad):

// It would be valid to fall back to the code below here that simply calls
// load(). However, as an optimization, we want to avoid unnecessary copies
// of isByRef=true types. Here, we scan forward in the current block,
// looking to see if this load dies before any side effects occur.
// In such case, we can safely return the operand without making a copy.
for (body[body_i..]) |body_inst| {
    switch (self.liveness.categorizeOperand(self.air, body_inst, inst)) {
        .none => continue,
        .write, .noret, .complex => break :elide,
        .tomb => return try self.resolveInst(ty_op.operand),
    }
} else unreachable;

The bounds check inserts a block between the load of the inner array and its last usage. Liveness conservatively decides this block has side effects without analyzing it, and the optimization is not applied.
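For what it's worth, here is a sketch that illustrates this (the reduction above with runtime safety disabled; an illustration, not a suggested fix): without the bounds check there is no intervening block, so the load should die before any side effects and the copy should be elided, matching the ReleaseFast behavior.

```zig
test "reduction without safety checks (sketch)" {
    // Disabling runtime safety removes the bounds-check block between the
    // load of array_ptr[a] and its use, so the scan in airLoad can take
    // the .tomb path and skip the 80 kB copy.
    @setRuntimeSafety(false);
    var array: [10][10_000]u64 = undefined;
    var array_ptr = &array;
    var a: usize = 1;
    _ = array_ptr[a][a];
}
```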

@topolarity topolarity self-assigned this Oct 2, 2022
@Vexu
Member

Vexu commented Oct 3, 2022

Note that the issue starts in AstGen, where array and field accesses force the lhs to be loaded even though the operation could just as well be done on a pointer.
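In source terms, "done on a pointer" means the whole access chain stays as element pointers. A small standalone sketch (hypothetical test, not the AstGen fix itself):

```zig
const std = @import("std");

test "operate on a pointer instead of loading the row (sketch)" {
    var array: [10][10_000]u64 = undefined;
    array[1][1] = 1;
    var a: usize = 1;
    var b: usize = 1;
    // Taking the element address keeps the chain as pointer arithmetic,
    // so no [10_000]u64 row copy needs to be loaded first.
    const elem: *u64 = &array[a][b];
    elem.* += elem.* * 2;
    try std.testing.expect(array[a][b] == 3);
}
```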

topolarity pushed a commit to topolarity/zig that referenced this issue Oct 5, 2022
This change adds to Liveness a simple pattern match for the
try-like `.condbr` blocks emitted by Sema's safety checks. This
allows us to determine that these do not modify memory, which
permits us to elide additional loads in the backend.

As @Vexu points out in the main issue, this is probably not a
complete solution on its own. We'll still want a way to reliably
narrow the load/copy when performing several consecutive accesses,
such as `foo.arr[x][y].z`

Resolves ziglang#12215
@andrewrk andrewrk modified the milestones: 0.10.0, 0.10.1 Oct 30, 2022
andrewrk pushed a commit that referenced this issue Dec 12, 2022
@andrewrk andrewrk modified the milestones: 0.10.1, 0.11.0 Dec 12, 2022
xxxbxxx added a commit to xxxbxxx/zig that referenced this issue Jun 24, 2023
…mory

followup to [25d3713]
Resolves ziglang#12215

Previous code didn't account for the extra unreach() that now exists in the air:

```
 %29!= block(void, {
    %30!= cond_br(%22!, {
      %31!= br(%29, @Air.Inst.Ref.void_value)
    }, {
      %2! %15!
      %27!= call(%26, [%19!, %21])
      %28!= unreach()
    })
  } %22!)
```
andrewrk pushed a commit that referenced this issue Jun 24, 2023