From a845bd89c47c02e82c2f865170edbe9984a143af Mon Sep 17 00:00:00 2001
From: adprasad
Date: Tue, 18 Jun 2024 13:36:01 +0530
Subject: [PATCH 1/7] [UnJ] Move LoopUnrollAndJamPass before SLPVectorizerPass

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 757b20dcd6693..9a04c1013e86c 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1317,6 +1317,11 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
     FPM.addPass(BDCEPass());
   }
+  // We do UnrollAndJam in a separate LPM to Unroll ensure it happens first.
+  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+  }
   // Optimize parallel scalar instruction chains into SIMD instructions.
   if (PTO.SLPVectorization) {
     FPM.addPass(SLPVectorizerPass());
@@ -1335,11 +1340,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   // FIXME: It would be really good to use a loop-integrated instruction
   // combiner for cleanup here so that the unrolling and LICM can be pipelined
   // across the loop nests.
-  // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
-    FPM.addPass(createFunctionToLoopPassAdaptor(
-        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
-  }
   FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
       Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
       PTO.ForgetAllSCEVInLoopUnroll)));

From 74656b4bc8f51932a0e45da0668ac8844eed4b92 Mon Sep 17 00:00:00 2001
From: adprasad
Date: Tue, 25 Jun 2024 15:26:56 +0530
Subject: [PATCH 2/7] [UnJ] Add comments explaining new position of UnrollAndJam

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 9a04c1013e86c..beb4325829d74 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1245,6 +1245,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   // combiner for cleanup here so that the unrolling and LICM can be pipelined
   // across the loop nests.
   // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+  // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
   if (EnableUnrollAndJam && PTO.LoopUnrolling)
     FPM.addPass(createFunctionToLoopPassAdaptor(
         LoopUnrollAndJamPass(Level.getSpeedupLevel())));
@@ -1318,6 +1319,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   }
   // We do UnrollAndJam in a separate LPM to Unroll ensure it happens first.
+  // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
   if (EnableUnrollAndJam && PTO.LoopUnrolling) {
     FPM.addPass(createFunctionToLoopPassAdaptor(
         LoopUnrollAndJamPass(Level.getSpeedupLevel())));

From 7368a69f82c94e0e8077e37fd179e2bd22ba7353 Mon Sep 17 00:00:00 2001
From: adprasad
Date: Tue, 2 Jul 2024 15:03:17 +0530
Subject: [PATCH 3/7] [UnJ] Do not run UnrollAndJam twice if full LTO

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index beb4325829d74..d5943cdc1581d 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1320,7 +1320,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   // We do UnrollAndJam in a separate LPM to Unroll ensure it happens first.
   // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
-  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+  if (!IsFullLTO && EnableUnrollAndJam && PTO.LoopUnrolling) {
     FPM.addPass(createFunctionToLoopPassAdaptor(
         LoopUnrollAndJamPass(Level.getSpeedupLevel())));
   }

From 721e8d7cd8b57161f9c33b2b80f469eaa0fae397 Mon Sep 17 00:00:00 2001
From: adprasad
Date: Wed, 10 Jul 2024 02:31:29 +0530
Subject: [PATCH 4/7] [UnJ] Add test in Transforms/PhaseOrdering for outer loop vectorization

---
 .../PhaseOrdering/outer-loop-vectorize.ll | 174 ++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll b/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
new file mode 100644
index 0000000000000..b27433d2997fe
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64
+@b = dso_local global [32000 x float] zeroinitializer, align 64
+@c = dso_local global [32000 x float] zeroinitializer, align 64
+@d = dso_local global [32000 x float] zeroinitializer, align 64
+@a = dso_local global [32000 x float] zeroinitializer, align 64
+@e = dso_local global [32000 x float] zeroinitializer, align 64
+@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64
+
+; Function Attrs: nounwind uwtable vscale_range(1,16)
+define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 {
+; CHECK-LABEL: @s2275(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float undef
+; CHECK: vector.ph:
+; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3
+; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]]
+; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
+; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2
+; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
+; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]]
+; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]]
+; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]]
+; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256
+; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: for.cond.cleanup3:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1
+; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000
+; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: for.cond.cleanup7:
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]]
+; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]]
+; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]]
+; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256
+; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3
+  %nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ]
+  br label %for.cond5.preheader
+
+for.cond.cleanup: ; preds = %for.cond.cleanup3
+  ret float undef
+
+for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7
+  %indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ]
+  br label %for.body8
+
+for.cond.cleanup3: ; preds = %for.cond.cleanup7
+  %call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2
+  %inc37 = add nuw nsw i32 %nl.056, 1
+  %exitcond62.not = icmp eq i32 %inc37, 39000
+  br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6
+
+for.cond.cleanup7: ; preds = %for.body8
+  %arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58
+  %0 = load float, ptr %arrayidx24, align 4, !tbaa !8
+  %arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58
+  %1 = load float, ptr %arrayidx26, align 4, !tbaa !8
+  %arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58
+  %2 = load float, ptr %arrayidx28, align 4, !tbaa !8
+  %mul29 = fmul fast float %2, %1
+  %add30 = fadd fast float %mul29, %0
+  %arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58
+  store float %add30, ptr %arrayidx32, align 4, !tbaa !8
+  %indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1
+  %exitcond61.not = icmp eq i64 %indvars.iv.next59, 256
+  br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12
+
+for.body8: ; preds = %for.cond5.preheader, %for.body8
+  %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+  %arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+  %3 = load float, ptr %arrayidx10, align 4, !tbaa !8
+  %arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+  %4 = load float, ptr %arrayidx14, align 4, !tbaa !8
+  %arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58
+  %5 = load float, ptr %arrayidx18, align 4, !tbaa !8
+  %mul = fmul fast float %5, %4
+  %add = fadd fast float %mul, %3
+  store float %add, ptr %arrayidx10, align 4, !tbaa !8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14
+}
+
+declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1
+
+attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!"clang version 19.0.0git (git@github.com:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
+!12 = distinct !{!12, !7, !13}
+!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
+!14 = distinct !{!14}

From 0f8326e66534dc3a9fcd97c8d3401f8447333517 Mon Sep 17 00:00:00 2001
From: adprasad
Date: Wed, 24 Jul 2024 15:53:17 +0530
Subject: [PATCH 5/7] [UnJ] Run UnJ with !IsFullLTO in same place as UnJ with IsFullLTO

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index d5943cdc1581d..bf62747f3734a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1236,6 +1236,27 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   if (EnableInferAlignmentPass)
     FPM.addPass(InferAlignmentPass());
+
+  // Cleanup after loop vectorization. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                  .forwardSwitchCondToPhi(true)
+                                  .convertSwitchRangeToICmp(true)
+                                  .convertSwitchToLookupTable(true)
+                                  .needCanonicalLoops(false)
+                                  .hoistCommonInsts(true)
+                                  .sinkCommonInsts(true)));
+
+  // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
+  // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
+  // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
+  // placing it immediately before the SLPVectorizerPass, presumably due to analysis re-use.
+  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
+  }
+
   if (IsFullLTO) {
     // The vectorizer may have significantly shortened a loop body; unroll
     // again. Unroll small loops to hide loop backedge latency and saturate any
@@ -1244,11 +1265,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   // FIXME: It would be really good to use a loop-integrated instruction
   // combiner for cleanup here so that the unrolling and LICM can be pipelined
   // across the loop nests.
-  // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-  // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
-  if (EnableUnrollAndJam && PTO.LoopUnrolling)
-    FPM.addPass(createFunctionToLoopPassAdaptor(
-        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
   FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
       Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
       PTO.ForgetAllSCEVInLoopUnroll)));
@@ -1318,12 +1334,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
     FPM.addPass(BDCEPass());
   }
-  // We do UnrollAndJam in a separate LPM to Unroll ensure it happens first.
-  // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
-  if (!IsFullLTO && EnableUnrollAndJam && PTO.LoopUnrolling) {
-    FPM.addPass(createFunctionToLoopPassAdaptor(
-        LoopUnrollAndJamPass(Level.getSpeedupLevel())));
-  }
   // Optimize parallel scalar instruction chains into SIMD instructions.
   if (PTO.SLPVectorization) {
     FPM.addPass(SLPVectorizerPass());

From 5088bacf52f667ed327018e958a01da9bf546d70 Mon Sep 17 00:00:00 2001
From: adprasad
Date: Thu, 25 Jul 2024 07:18:56 -0700
Subject: [PATCH 6/7] [UnJ] [SimplifyCFG] Only run extra SimplifyCFGPass if UnJ enabled

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index bf62747f3734a..77743f275b1d3 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1237,22 +1237,23 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
   if (EnableInferAlignmentPass)
     FPM.addPass(InferAlignmentPass());
-  // Cleanup after loop vectorization. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
-                                  .forwardSwitchCondToPhi(true)
-                                  .convertSwitchRangeToICmp(true)
-                                  .convertSwitchToLookupTable(true)
-                                  .needCanonicalLoops(false)
-                                  .hoistCommonInsts(true)
-                                  .sinkCommonInsts(true)));
-
   // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
   // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
   // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
-  // placing it immediately before the SLPVectorizerPass, presumably due to analysis re-use.
+  // placing it immediately before the SLPVectorizerPass, due to analysis re-use.
   if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+    // Cleanup after loop vectorization. Simplification passes like CVP and
+    // GVN, loop transforms, and others have already run, so it's now better to
+    // convert to more optimized IR using more aggressive simplify CFG options.
+    // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP outer loop vectorization to happen.
+    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                    .forwardSwitchCondToPhi(true)
+                                    .convertSwitchRangeToICmp(true)
+                                    .convertSwitchToLookupTable(true)
+                                    .needCanonicalLoops(false)
+                                    .hoistCommonInsts(true)
+                                    .sinkCommonInsts(true)));
+
     FPM.addPass(createFunctionToLoopPassAdaptor(
         LoopUnrollAndJamPass(Level.getSpeedupLevel())));
   }

From b9059f4f343fc011a1d7c96e7550da571cd47b5a Mon Sep 17 00:00:00 2001
From: adprasad
Date: Fri, 26 Jul 2024 18:32:26 +0530
Subject: [PATCH 7/7] [UnJ] [SimplifyCFG] Fix comment formatting

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 77743f275b1d3..a8f9a1590c150 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1238,14 +1238,17 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
     FPM.addPass(InferAlignmentPass());
   // We do UnrollAndJam in a separate LPM to Unroll to ensure it happens first.
-  // In order for outer loop vectorization to be done, UnrollAndJam must occur before the SLPVectorizerPass.
-  // Placing UnrollAndJam immediately after the LoopVectorizePass when !IsFullLTO leads to improved compile times versus
-  // placing it immediately before the SLPVectorizerPass, due to analysis re-use.
+  // In order for outer loop vectorization to be done, UnrollAndJam must occur
+  // before the SLPVectorizerPass. Placing UnrollAndJam immediately after the
+  // LoopVectorizePass when !IsFullLTO leads to improved compile times versus
+  // placing it immediately before the SLPVectorizerPass, due to analysis
+  // re-use.
   if (EnableUnrollAndJam && PTO.LoopUnrolling) {
     // Cleanup after loop vectorization. Simplification passes like CVP and
     // GVN, loop transforms, and others have already run, so it's now better to
     // convert to more optimized IR using more aggressive simplify CFG options.
-    // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP outer loop vectorization to happen.
+    // SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP
+    // outer loop vectorization to happen.
     FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                     .forwardSwitchCondToPhi(true)
                                     .convertSwitchRangeToICmp(true)
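
For context on what the pipeline change buys, the IR in outer-loop-vectorize.ll corresponds to a TSVC-style loop nest roughly like the C sketch below. This is reconstructed by hand from the test's IR; the function name and the LEN constant are illustrative and not part of the patch series. The inner loop walks down a column of the 256x256 arrays, so its accesses are strided and the inner loop vectorizes poorly on its own; unroll-and-jamming the outer j loop by 4 and then running the SLP vectorizer fuses the four jammed scalar bodies into the <4 x float> operations the CHECK lines above expect, which is the outer-loop vectorization this series enables by moving LoopUnrollAndJamPass ahead of SLPVectorizerPass.

/* Hand-written C sketch of the kernel in the test (assumed shape, not taken
   verbatim from the patch). */
#define LEN 256

static float aa[LEN][LEN], bb[LEN][LEN], cc[LEN][LEN];
static float a[32000], b[32000], c[32000], d[32000];

void s2275_kernel(void) {
  for (int j = 0; j < LEN; j++) {
    /* Column access aa[i][j] has stride LEN floats, so vectorizing along i
       is unprofitable. Unroll-and-jam of the j loop by 4 makes each jammed
       body touch aa[i][j..j+3], which SLP then turns into <4 x float> ops. */
    for (int i = 0; i < LEN; i++)
      aa[i][j] = aa[i][j] + bb[i][j] * cc[i][j];
    a[j] = b[j] + c[j] * d[j];
  }
}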