34 changes: 25 additions & 9 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1236,6 +1236,31 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,

if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());

// We run UnrollAndJam in a separate LPM from LoopUnroll so that it is
// guaranteed to happen first. For outer loop vectorization to occur,
// UnrollAndJam must run before the SLPVectorizerPass. When !IsFullLTO,
// placing UnrollAndJam immediately after the LoopVectorizePass gives better
// compile times than placing it immediately before the SLPVectorizerPass,
// due to analysis re-use.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
// Cleanup after loop vectorization. Simplification passes like CVP and
// GVN, loop transforms, and others have already run, so it's now better to
// convert to more optimized IR using more aggressive simplify CFG options.
// SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP
// outer loop vectorization to happen.
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchRangeToICmp(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));

FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}

if (IsFullLTO) {
// The vectorizer may have significantly shortened a loop body; unroll
// again. Unroll small loops to hide loop backedge latency and saturate any
@@ -1244,10 +1269,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
if (EnableUnrollAndJam && PTO.LoopUnrolling)
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
@@ -1335,11 +1356,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
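For orientation, the non-LTO ordering this hunk establishes can be sketched as follows. This is a minimal, hedged reconstruction assuming EnableUnrollAndJam and PTO.LoopUnrolling are both true; buildSketchPipeline is an illustrative helper, not an LLVM API, and the real addVectorPasses interleaves many other passes around these four.

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
using namespace llvm;

// Illustrative helper (not part of LLVM): shows only the relative pass order
// that matters for UnrollAndJam-SLP outer loop vectorization.
static FunctionPassManager buildSketchPipeline(OptimizationLevel Level) {
  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass()); // inner-loop vectorization runs first
  // Aggressive CFG cleanup must precede UnrollAndJam (see the comment in the
  // hunk above).
  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                  .forwardSwitchCondToPhi(true)
                                  .convertSwitchRangeToICmp(true)
                                  .convertSwitchToLookupTable(true)
                                  .needCanonicalLoops(false)
                                  .hoistCommonInsts(true)
                                  .sinkCommonInsts(true)));
  // UnrollAndJam gets its own loop pass manager so it is guaranteed to run
  // before the later LoopUnroll, and it must run before SLP so the jammed
  // inner body exposes straight-line code for SLP to widen.
  FPM.addPass(createFunctionToLoopPassAdaptor(
      LoopUnrollAndJamPass(Level.getSpeedupLevel())));
  FPM.addPass(SLPVectorizerPass());
  return FPM;
}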
174 changes: 174 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
@@ -0,0 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
Review comment (Contributor): Remove triple

@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64
@b = dso_local global [32000 x float] zeroinitializer, align 64
@c = dso_local global [32000 x float] zeroinitializer, align 64
@d = dso_local global [32000 x float] zeroinitializer, align 64
@a = dso_local global [32000 x float] zeroinitializer, align 64
@e = dso_local global [32000 x float] zeroinitializer, align 64
@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64

; Function Attrs: nounwind uwtable vscale_range(1,16)
define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 {
; CHECK-LABEL: @s2275(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK: for.cond1.preheader:
; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret float undef
; CHECK: vector.ph:
; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2
; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3
; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]]
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]]
; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]]
; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2
; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]]
; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]]
; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]]
; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]]
; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256
; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: for.cond.cleanup3:
; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1
; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000
; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: for.cond.cleanup7:
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]]
; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]]
; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256
; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
br label %for.cond1.preheader

for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3
%nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ]
br label %for.cond5.preheader

for.cond.cleanup: ; preds = %for.cond.cleanup3
ret float undef

for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7
%indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ]
br label %for.body8

for.cond.cleanup3: ; preds = %for.cond.cleanup7
%call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2
%inc37 = add nuw nsw i32 %nl.056, 1
%exitcond62.not = icmp eq i32 %inc37, 39000
br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6

for.cond.cleanup7: ; preds = %for.body8
%arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58
%0 = load float, ptr %arrayidx24, align 4, !tbaa !8
%arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58
%1 = load float, ptr %arrayidx26, align 4, !tbaa !8
%arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58
%2 = load float, ptr %arrayidx28, align 4, !tbaa !8
%mul29 = fmul fast float %2, %1
%add30 = fadd fast float %mul29, %0
%arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58
store float %add30, ptr %arrayidx32, align 4, !tbaa !8
%indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1
%exitcond61.not = icmp eq i64 %indvars.iv.next59, 256
br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12

for.body8: ; preds = %for.cond5.preheader, %for.body8
%indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
%arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58
%3 = load float, ptr %arrayidx10, align 4, !tbaa !8
%arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58
%4 = load float, ptr %arrayidx14, align 4, !tbaa !8
%arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58
%5 = load float, ptr %arrayidx18, align 4, !tbaa !8
%mul = fmul fast float %5, %4
%add = fadd fast float %mul, %3
store float %add, ptr %arrayidx10, align 4, !tbaa !8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 256
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14
}

declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1

attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
attributes #2 = { nounwind }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 1}
!5 = !{!"clang version 19.0.0git ([email protected]:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!9, !9, i64 0}
!9 = !{!"float", !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}
!12 = distinct !{!12, !7, !13}
!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
!14 = distinct !{!14}
Review comment (Contributor, on lines +153 to +174): Can simplify the test by getting rid of unnecessary attributes and metadata
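For context, the IR in the new test reads as a TSVC-style s2275 kernel. The following C reconstruction is an approximation derived from the IR above: loop bounds, array names, and the dummy signature are taken from the test, while the pragma spelling is an assumption about how the llvm.loop.unroll_and_jam.count metadata was produced.

#define LEN1D 32000
#define LEN2D 256
float aa[LEN2D][LEN2D], bb[LEN2D][LEN2D], cc[LEN2D][LEN2D];
float a[LEN1D], b[LEN1D], c[LEN1D], d[LEN1D], e[LEN1D];

int dummy(float *, float *, float *, float *, float *,
          float (*)[LEN2D], float (*)[LEN2D], float (*)[LEN2D], float);

float s2275(void) {
  for (int nl = 0; nl < 39000; nl++) {
    // The inner j loop walks down a column of aa/bb/cc, so the unit-stride
    // direction is i: the profitable vectorization dimension is the outer
    // loop, which unroll-and-jam (count 4) plus SLP recovers.
#pragma unroll_and_jam(4) // assumed source of the unroll_and_jam.count !13
    for (int i = 0; i < LEN2D; i++) {
      for (int j = 0; j < LEN2D; j++)
        aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i];
      a[i] = b[i] + c[i] * d[i];
    }
    dummy(a, b, c, d, e, aa, bb, cc, 0.f);
  }
  return 0.f; // the test returns undef; any value works for this sketch
}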