Skip to content

[vectorization] gcc generate better code for a loop body with control flow #64292

Open
@zhongsir1

Description

@zhongsir1

test: https://godbolt.org/z/7Y6a4EvW9

/* Test case: per-element conditional update written with gotos.
 * For each i in [0, LEN_1D-1): when c[i] is negative, b[i] is written;
 * otherwise a[i] is written.  a, b, c, d, e, LEN_1D and real_t are
 * declared outside this snippet. */
void s1161(void)
{
        for (int i = 0; i < LEN_1D-1; ++i) {
            if (c[i] < (real_t)0.) {
                goto L20;   /* negative c[i]: take the b[i] path */
            }
            a[i] = c[i] + d[i] * e[i];   /* path for c[i] not less than zero */
            goto L10;   /* skip over the b[i] path */
L20:
            b[i] = a[i] + d[i] * d[i];   /* reads a[i], which the other path writes */
L10:
            ;   /* empty statement so the L10 label has a statement to attach to */
        }
}
  • gcc: generates an SVE loop
// Vectorized loop body (SVE, 64-bit lanes) for the goto form of s1161.
// Array roles are inferred by matching the operations against the C source;
// the base registers, x10 and w3 are set up before this excerpt.
.L2:
  lsl x1, x0, 3                        // x1 = i * 8 (byte offset of element i)
  ld1d z31.d, p7/z, [x9, x0, lsl 3]    // z31 = c[i...] (the value compared with 0.0)
  ld1d z29.d, p7/z, [x8, x0, lsl 3]    // z29 = d[i...] (multiplicand on both paths)
  add x2, x7, x1                       // x2 = &a[i]
  fcmlt p6.d, p7/z, z31.d, #0.0        // p6 = active lanes with c[i] < 0
  ld1d z30.d, p7/z, [x2]               // z30 = a[i...]
  not p6.b, p7/z, p6.b                 // p6 = active lanes with !(c[i] < 0)
  add x4, x5, x1                       // x4 = &e[i]
  add x1, x6, x1                       // x1 = &b[i]
  ld1d z28.d, p7/z, [x4]               // z28 = e[i...]
  fcmlt p7.d, p7/z, z31.d, #0.0        // same compare again; p7 (loop predicate) now = lanes with c[i] < 0
  fmla z31.d, p6/m, z29.d, z28.d       // z31 = c + d * e on the !(c < 0) lanes
  fmla z30.d, p7/m, z29.d, z29.d       // z30 = a + d * d on the c < 0 lanes
  st1d z31.d, p6, [x2]                 // a[i] = c[i] + d[i] * e[i], only where !(c[i] < 0)
  st1d z30.d, p7, [x1]                 // b[i] = a[i] + d[i] * d[i], only where c[i] < 0
  add x0, x0, x10                      // i += x10 (presumably lanes per vector; set outside this excerpt)
  whilelo p7.d, w0, w3                 // rebuild loop predicate: lanes whose index is below w3 (presumably the trip count)
  b.any .L2                            // iterate while any lane is still active
  • llvm: fails to vectorize the loop (scalar code is emitted)
// Scalar loop emitted by LLVM for the goto form of s1161: one element per
// iteration, with a branch selecting between the two store paths. Array roles
// are inferred by matching the operations against the C source; the base
// registers and x9 are set up before this excerpt.
.LBB0_1: // in Loop: Header=BB0_2 Depth=1
  ldr d0, [x11, x8]                    // d0 = a[i] (path taken when c[i] < 0)
  mov x15, x13                         // x15 = base used by the store below; no use of x15 is visible in this excerpt
  ldr d1, [x12, x8]                    // d1 = d[i]
  fmadd d0, d1, d1, d0                 // d0 = a[i] + d[i] * d[i]
  str d0, [x13, x8]                    // b[i] = d0
  add x8, x8, #8                       // advance byte offset to the next 8-byte element
  cmp x8, x9                           // x9 = end byte offset
  b.eq .LBB0_4                         // done: leave the loop (target is outside this excerpt)
.LBB0_2: // =>This Inner Loop Header: Depth=1
  ldr d0, [x10, x8]                    // d0 = c[i]
  fcmp d0, #0.0
  b.mi .LBB0_1                         // c[i] < 0: go to the b[i] path
// %bb.3: // in Loop: Header=BB0_2 Depth=1
  ldr d1, [x12, x8]                    // d1 = d[i]
  mov x15, x11                         // x15 = base used by the store below; no use of x15 is visible in this excerpt
  ldr d2, [x14, x8]                    // d2 = e[i]
  fmadd d0, d1, d2, d0                 // d0 = c[i] + d[i] * e[i]
  str d0, [x11, x8]                    // a[i] = d0
  add x8, x8, #8                       // advance byte offset to the next 8-byte element
  cmp x8, x9
  b.ne .LBB0_2                         // more elements: back to the loop header

Activity

vfdff

vfdff commented on Aug 3, 2023

@vfdff
Contributor
/* Reduced variant of s1161: the same two-way conditional store written as a
 * structured if/else, with a plain add in place of the multiply-add.
 * a, b, c, d, LEN_1D and real_t are declared outside this snippet. */
void s1161(void) {
  for (int i = 0; i < LEN_1D-1; ++i) {
    if (c[i] < (real_t)0)
      b[i] = a[i] + d[i];   /* negative c[i]: write b[i]; reads a[i] */
    else
      a[i] = c[i] + d[i];   /* otherwise: write a[i] */
  }
}
  • gcc:
; Vectorized loop body (SVE, 64-bit lanes) for the reduced if/else variant.
; Both results are computed for every lane; the predicated stores select which
; array each lane updates. Base registers and w3 are set up before this excerpt.
.L2:
        lsl     x1, x0, 3                    ; x1 = i * 8 (byte offset; elements are 8 bytes, as the ld1d .d loads show)
        ld1d    z0.d, p0/z, [x6, x0, lsl 3]  ; z0 = d[i]
        ld1d    z2.d, p0/z, [x7, x0, lsl 3]  ; z2 = c[i]
        add     x2, x5, x1                   ; x2 = &a[i]
        fcmlt   p2.d, p0/z, z2.d, #0.0       ; p2 = lanes where c[i] < (real_t)0
        ld1d    z1.d, p0/z, [x2]             ; z1 = a[i]
        add     x1, x4, x1                   ; x1 = &b[i]
        fadd    z1.d, z0.d, z1.d             ; z1 = a[i] + d[i] (value for b[i]; computed on all lanes)
        fcmge   p0.d, p0/z, z2.d, #0.0       ; p0 = lanes where c[i] >= 0; NOTE(review): equals !(c[i] < 0) only if NaNs are excluded - compiler flags are not shown here
        fadd    z0.d, z2.d, z0.d             ; z0 = c[i] + d[i] (value for a[i]; computed on all lanes)
        st1d    z1.d, p2, [x1]               ; b[i] = a[i] + d[i] on the c[i] < 0 lanes
        st1d    z0.d, p0, [x2]               ; a[i] = c[i] + d[i] on the c[i] >= 0 lanes
        incd    x0                           ; i += number of 64-bit lanes per vector
        whilelo p0.d, w0, w3                 ; rebuild loop predicate: lanes whose index is below w3 (presumably the trip count)
        b.any   .L2                          ; iterate while any lane is still active
  • llvm: fails with "LV: Can't vectorize due to memory conflicts"; with -mllvm -debug-only=loop-accesses the analysis reports:
LAA: We can't vectorize because we can't find the array bounds
vfdff

vfdff commented on Aug 15, 2023

@vfdff
Contributor
  • Adding the option -mllvm -simplifycfg-sink-common=false lets LLVM vectorize the loop: https://godbolt.org/z/hhcjxGEG7
    • [LAA] Analyze pointers forked by a select, D108699
    • [LAA] Init analyze pointers forked by a phi, D102266, expects the following PHI node form:
     if.then:      %gep.1 = getelementptr inbounds double, ptr %B, i64 %iv
     if.else:      %gep.2 = getelementptr inbounds double, ptr %C, i64 %iv
     loop.latch:   %gep.2.sink = phi ptr [ %gep.2, %if.else ], [ %gep.1, %if.then ]
    
vfdff

vfdff commented on Aug 19, 2023

@vfdff
Contributor

s1161_ptr is a simpler case (https://godbolt.org/z/xaqacGo81),
which may require an extension similar to D114480, but for PHINode.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

      Development

      No branches or pull requests

        Participants

        @vfdff@EugeneZelenko@zhongsir1

        Issue actions

          [vectorization] gcc generate better code for a loop body with control flow · Issue #64292 · llvm/llvm-project