34 changes: 25 additions & 9 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1236,6 +1236,31 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,

if (EnableInferAlignmentPass)
FPM.addPass(InferAlignmentPass());

// We run UnrollAndJam in a separate LPM from LoopUnroll so that it is
// guaranteed to happen first. For outer loop vectorization to occur,
// UnrollAndJam must run before the SLPVectorizerPass. When !IsFullLTO,
// placing UnrollAndJam immediately after the LoopVectorizePass gives better
// compile times than placing it immediately before the SLPVectorizerPass,
// due to analysis re-use.
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
// Cleanup after loop vectorization. Simplification passes like CVP and
// GVN, loop transforms, and others have already run, so it's now better to
// convert to more optimized IR using more aggressive simplify CFG options.
// SimplifyCFGPass must be run before UnrollAndJam for UnrollAndJam-SLP
// outer loop vectorization to happen.
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchRangeToICmp(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));

FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}

if (IsFullLTO) {
// The vectorizer may have significantly shortened a loop body; unroll
// again. Unroll small loops to hide loop backedge latency and saturate any
@@ -1244,10 +1269,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
if (EnableUnrollAndJam && PTO.LoopUnrolling)
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
@@ -1335,11 +1356,6 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
// FIXME: It would be really good to use a loop-integrated instruction
// combiner for cleanup here so that the unrolling and LICM can be pipelined
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
FPM.addPass(createFunctionToLoopPassAdaptor(
LoopUnrollAndJamPass(Level.getSpeedupLevel())));
}
FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll)));
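For orientation, the non-LTO ordering this hunk establishes can be sketched as follows. This is a minimal, hedged reconstruction assuming EnableUnrollAndJam and PTO.LoopUnrolling are both true; buildSketchPipeline is an illustrative helper, not an LLVM API, and the real addVectorPasses interleaves many other passes around these four.

#include "llvm/IR/PassManager.h"
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
using namespace llvm;

// Illustrative helper (not part of LLVM): shows only the relative pass order
// that matters for UnrollAndJam-SLP outer loop vectorization.
static FunctionPassManager buildSketchPipeline(OptimizationLevel Level) {
  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass()); // inner-loop vectorization runs first
  // Aggressive CFG cleanup must precede UnrollAndJam (see the comment in the
  // hunk above).
  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                  .forwardSwitchCondToPhi(true)
                                  .convertSwitchRangeToICmp(true)
                                  .convertSwitchToLookupTable(true)
                                  .needCanonicalLoops(false)
                                  .hoistCommonInsts(true)
                                  .sinkCommonInsts(true)));
  // UnrollAndJam gets its own loop pass manager so it is guaranteed to run
  // before the later LoopUnroll, and it must run before SLP so the jammed
  // inner body exposes straight-line code for SLP to widen.
  FPM.addPass(createFunctionToLoopPassAdaptor(
      LoopUnrollAndJamPass(Level.getSpeedupLevel())));
  FPM.addPass(SLPVectorizerPass());
  return FPM;
}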
174 changes: 174 additions & 0 deletions llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
@@ -0,0 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
Review comment (Contributor): Remove triple

@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64
@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64
@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64
@b = dso_local global [32000 x float] zeroinitializer, align 64
@c = dso_local global [32000 x float] zeroinitializer, align 64
@d = dso_local global [32000 x float] zeroinitializer, align 64
@a = dso_local global [32000 x float] zeroinitializer, align 64
@e = dso_local global [32000 x float] zeroinitializer, align 64
@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64

; Function Attrs: nounwind uwtable vscale_range(1,16)
define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 {
; CHECK-LABEL: @s2275(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK: for.cond1.preheader:
; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret float undef
; CHECK: vector.ph:
; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2
; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3
; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]]
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]]
; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]]
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]]
; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]]
; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2
; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]]
; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]]
; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]]
; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]]
; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]]
; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]]
; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256
; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: for.cond.cleanup3:
; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1
; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000
; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: for.cond.cleanup7:
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]]
; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]]
; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]]
; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]]
; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256
; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
br label %for.cond1.preheader

for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3
%nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ]
br label %for.cond5.preheader

for.cond.cleanup: ; preds = %for.cond.cleanup3
ret float undef

for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7
%indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ]
br label %for.body8

for.cond.cleanup3: ; preds = %for.cond.cleanup7
%call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2
%inc37 = add nuw nsw i32 %nl.056, 1
%exitcond62.not = icmp eq i32 %inc37, 39000
br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6

for.cond.cleanup7: ; preds = %for.body8
%arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58
%0 = load float, ptr %arrayidx24, align 4, !tbaa !8
%arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58
%1 = load float, ptr %arrayidx26, align 4, !tbaa !8
%arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58
%2 = load float, ptr %arrayidx28, align 4, !tbaa !8
%mul29 = fmul fast float %2, %1
%add30 = fadd fast float %mul29, %0
%arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58
store float %add30, ptr %arrayidx32, align 4, !tbaa !8
%indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1
%exitcond61.not = icmp eq i64 %indvars.iv.next59, 256
br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12

for.body8: ; preds = %for.cond5.preheader, %for.body8
%indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
%arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58
%3 = load float, ptr %arrayidx10, align 4, !tbaa !8
%arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58
%4 = load float, ptr %arrayidx14, align 4, !tbaa !8
%arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58
%5 = load float, ptr %arrayidx18, align 4, !tbaa !8
%mul = fmul fast float %5, %4
%add = fadd fast float %mul, %3
store float %add, ptr %arrayidx10, align 4, !tbaa !8
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 256
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14
}

declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1

attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" }
attributes #2 = { nounwind }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 1}
!5 = !{!"clang version 19.0.0git ([email protected]:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!9, !9, i64 0}
!9 = !{!"float", !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C/C++ TBAA"}
!12 = distinct !{!12, !7, !13}
!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4}
!14 = distinct !{!14}
Review comment (Contributor, on lines +153 to +174): Can simplify the test by getting rid of unnecessary attributes and metadata
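For context, the IR in the new test reads as a TSVC-style s2275 kernel. The following C reconstruction is an approximation derived from the IR above: loop bounds, array names, and the dummy signature are taken from the test, while the pragma spelling is an assumption about how the llvm.loop.unroll_and_jam.count metadata was produced.

#define LEN1D 32000
#define LEN2D 256
float aa[LEN2D][LEN2D], bb[LEN2D][LEN2D], cc[LEN2D][LEN2D];
float a[LEN1D], b[LEN1D], c[LEN1D], d[LEN1D], e[LEN1D];

int dummy(float *, float *, float *, float *, float *,
          float (*)[LEN2D], float (*)[LEN2D], float (*)[LEN2D], float);

float s2275(void) {
  for (int nl = 0; nl < 39000; nl++) {
    // The inner j loop walks down a column of aa/bb/cc, so the unit-stride
    // direction is i: the profitable vectorization dimension is the outer
    // loop, which unroll-and-jam (count 4) plus SLP recovers.
#pragma unroll_and_jam(4) // assumed source of the unroll_and_jam.count !13
    for (int i = 0; i < LEN2D; i++) {
      for (int j = 0; j < LEN2D; j++)
        aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i];
      a[i] = b[i] + c[i] * d[i];
    }
    dummy(a, b, c, d, e, aa, bb, cc, 0.f);
  }
  return 0.f; // the test returns undef; any value works for this sketch
}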