
Commit 0328528

[LV] Relax high loop trip count threshold for deciding to interleave the loop
The current loop trip count threshold that gates loop interleaving is 128, which seems arbitrarily high and uncorrelated with factors such as the vectorization width, interleaving count, and register pressure. A set of microbenchmarks in llvm-test-suite (llvm/llvm-test-suite#26), when tested on an AArch64 platform, shows that loop interleaving is beneficial even for loops with low trip counts. We have also found similar evidence in an application benchmark: when compiled with PGO, it shows a 40% regression because its hot loop, with a profile-guided trip count of 24, does not get interleaved due to this threshold. Therefore, it seems reasonable to eliminate this threshold and instead use the trip count when computing the interleaving count (#73766).
1 parent ff05c30 · commit 0328528

19 files changed: +2196 −1507 lines
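
The commit message above describes replacing a hard trip-count cutoff with a computation that folds the trip count into the interleave-count decision. The following C++ sketch is only an illustration of that idea, not the actual selectInterleaveCount() logic; the helper name clampInterleaveCount and its parameters are hypothetical.

#include <algorithm>
#include <cstdint>
#include <optional>

// Hypothetical sketch: derive an interleave count (IC) from the vectorization
// factor (VF), the target's maximum IC, and an optional best-known trip count.
// Instead of forcing IC = 1 whenever the trip count is below a fixed threshold,
// the trip count simply bounds how much interleaving is worthwhile.
unsigned clampInterleaveCount(unsigned VF, unsigned MaxIC,
                              std::optional<uint64_t> BestKnownTC) {
  unsigned IC = MaxIC;
  if (BestKnownTC) {
    // Keep VF * IC within the trip count so every interleaved copy of the
    // vector body can still execute at least once.
    uint64_t MaxICByTC = std::max<uint64_t>(1, *BestKnownTC / VF);
    IC = static_cast<unsigned>(std::min<uint64_t>(IC, MaxICByTC));
  }
  return std::max(1u, IC);
}

// Example: with a profile-guided trip count of 24, VF = 8 and a target maximum
// of 4, this yields IC = 3, whereas the old threshold (24 < 128) forced IC = 1.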

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 13 deletions
@@ -267,11 +267,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

-static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
-    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
-    cl::desc("We don't interleave loops with a estimated constant trip count "
-             "below this number"));
-
 static cl::opt<unsigned> ForceTargetNumScalarRegs(
     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of scalar registers."));

@@ -5348,14 +5343,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,

   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
   const bool HasReductions = !Legal->getReductionVars().empty();
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
-  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
-  // because with the above conditions interleaving can expose ILP and break
-  // cross iteration dependences for reductions.
-  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
-      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
-    return 1;

   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
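
For reference, the deleted guard above is what previously disabled interleaving for short loops. The restatement below simply replays those removed lines against the example from the commit message (it is annotation, not new LoopVectorize code): with the default threshold of 128, a hot loop whose profile-guided trip count is 24 always took the early return and was left with an interleave count of 1.

// Removed early exit, annotated with the commit message's example:
//   BestKnownTC                      = 24   (profile-guided estimate)
//   TinyTripCountInterleaveThreshold = 128  (default from the deleted cl::opt)
//   The scalar-reduction exception (InterleaveSmallLoopScalarReduction &&
//   HasReductions && VF.isScalar()) does not apply, so the function returned 1
//   and the loop was never interleaved.
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
    !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
  return 1;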

llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ entry:


 ; VECTORIZE: mul <4 x i32>
+; VECTORIZE: mul <4 x i32>
+; VECTORIZE-NOT: mul <4 x i32>

 for.body:                           ; preds = %for.body, %entry
   %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll

Lines changed: 26 additions & 20 deletions
@@ -326,34 +326,40 @@ define void @trunc_invariant_sdiv_result(i32 %a, i32 %b, ptr noalias %src, ptr %
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP6]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = mul <16 x i16> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i16> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 32
+; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP9]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP10]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK: vec.epilog.iter.check:
 ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
 ; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[INVAR_DIV]] to i16
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[TMP8]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[INVAR_DIV]] to i16
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ 96, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX3]]
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i16>
-; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i16> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX3]]
-; CHECK-NEXT: store <4 x i16> [[TMP13]], ptr [[TMP14]], align 2
-; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX3]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 100
-; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ 96, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX4]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16>
+; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i16> [[TMP14]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX4]]
+; CHECK-NEXT: store <4 x i16> [[TMP17]], ptr [[TMP18]], align 2
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 100
+; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
-; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s

 target triple = "aarch64-linux-gnu"

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2-
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
1+
; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
32

43
target triple = "aarch64-linux-gnu"
54

llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll

Lines changed: 54 additions & 31 deletions
@@ -12,25 +12,38 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-LABEL: @induction_i7(
 ; CHECK: vector.ph:
 ; CHECK: %ind.end = trunc i64 %n.vec to i7
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i7>
-; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i7> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i7> [[TMP6]], shufflevector (<vscale x 2 x i7> insertelement (<vscale x 2 x i7> poison, i7 1, i64 0), <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7>
+; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i7> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i7> [[TMP8]], shufflevector (<vscale x 2 x i7> insertelement (<vscale x 2 x i7> poison, i7 1, i64 0), <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP9]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i7> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[EXT:%.+]] = zext <vscale x 2 x i7> [[TMP11]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[EXT]], ptr [[TMP13]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[VEC_IND]],
-;
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], [[DOTSPLAT:%.*]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i7> [[STEP_ADD]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i7> [[TMP19]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i7> [[TMP20]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP27]]
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP23]], ptr [[TMP25]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP24]], ptr [[TMP28]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[STEP_ADD]], [[DOTSPLAT]]
+
 entry:
 br label %for.body

@@ -59,24 +72,34 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: @induction_i3_zext(
 ; CHECK: vector.ph:
 ; CHECK: %ind.end = trunc i64 %n.vec to i3
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
-; CHECK: [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i3>
-; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i3> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i3> [[TMP6]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i64 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
+; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i3> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i3> [[TMP8]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i64 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i3> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP13]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[VEC_IND]],
-;
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i3> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i3> [[STEP_ADD]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 2
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP25]]
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP19]], ptr [[TMP23]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP20]], ptr [[TMP26]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[STEP_ADD]], [[DOTSPLAT]]
 entry:
 br label %for.body


llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s

 define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
 ;CHECK: vector.body:
