-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Swap UnrollAndJam Pass to before the SLP Vectorizer Pass #97029
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
adprasad-nvidia
wants to merge
7
commits into
llvm:main
Choose a base branch
from
adprasad-nvidia:adprasad/swap-unj-before-slp
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+199
−9
Open
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
a845bd8
[UnJ] Move LoopUnrollAndJamPass before SLPVectorizerPass
adprasad-nvidia 74656b4
[UnJ] Add comments explaining new position of UnrollAndJam
adprasad-nvidia 7368a69
[UnJ] Do not run UnrollAndJam twice if full LTO
adprasad-nvidia 721e8d7
[UnJ] Add test in Transforms/PhaseOrdering for outer loop vectorization
adprasad-nvidia 0f8326e
[UnJ] Run UnJ with !IsFullLTO in same place as UnJ with IsFullLTO
adprasad-nvidia 5088bac
[UnJ] [SimplifyCFG] Only run extra SimplifyCFGPass if UnJ enabled
adprasad-nvidia b9059f4
[UnJ] [SimplifyCFG] Fix comment formatting
adprasad-nvidia File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
174 changes: 174 additions & 0 deletions
174
llvm/test/Transforms/PhaseOrdering/outer-loop-vectorize.ll
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py | ||
; RUN: opt -passes='default<O3>' -enable-unroll-and-jam -allow-unroll-and-jam -S < %s | FileCheck %s | ||
|
||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" | ||
target triple = "aarch64-unknown-linux-gnu" | ||
|
||
@aa = dso_local global [256 x [256 x float]] zeroinitializer, align 64 | ||
@bb = dso_local global [256 x [256 x float]] zeroinitializer, align 64 | ||
@cc = dso_local global [256 x [256 x float]] zeroinitializer, align 64 | ||
@b = dso_local global [32000 x float] zeroinitializer, align 64 | ||
@c = dso_local global [32000 x float] zeroinitializer, align 64 | ||
@d = dso_local global [32000 x float] zeroinitializer, align 64 | ||
@a = dso_local global [32000 x float] zeroinitializer, align 64 | ||
@e = dso_local global [32000 x float] zeroinitializer, align 64 | ||
@tt = dso_local local_unnamed_addr global [256 x [256 x float]] zeroinitializer, align 64 | ||
|
||
; Function Attrs: nounwind uwtable vscale_range(1,16) | ||
define dso_local nofpclass(nan inf) float @s2275(ptr nocapture noundef readnone %func_args) local_unnamed_addr #0 { | ||
; CHECK-LABEL: @s2275( | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] | ||
; CHECK: for.cond1.preheader: | ||
; CHECK-NEXT: [[NL_056:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] | ||
; CHECK-NEXT: br label [[VECTOR_PH:%.*]] | ||
; CHECK: for.cond.cleanup: | ||
; CHECK-NEXT: ret float undef | ||
; CHECK: vector.ph: | ||
; CHECK-NEXT: [[INDVARS_IV58:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT59_3:%.*]], [[FOR_COND_CLEANUP7:%.*]] ] | ||
; CHECK-NEXT: [[INDVARS_IV_NEXT59_1:%.*]] = or disjoint i64 [[INDVARS_IV58]], 2 | ||
; CHECK-NEXT: [[INDVARS_IV_NEXT59_2:%.*]] = or disjoint i64 [[INDVARS_IV58]], 3 | ||
; CHECK-NEXT: [[INDVARS_IV_NEXT59_3]] = add nuw nsw i64 [[INDVARS_IV58]], 4 | ||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] | ||
; CHECK: vector.body: | ||
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] | ||
; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ] | ||
; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] | ||
; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1 | ||
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[INDEX]], i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP0]], i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 2 | ||
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP2]], align 16, !tbaa [[TBAA6:![0-9]+]] | ||
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[TMP4]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[TMP6]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP9]], [[TMP8]] | ||
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[TMP10]], [[TMP7]] | ||
; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[TMP2]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[INDEX_2]], 1 | ||
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]] | ||
; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 8, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]] | ||
; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 8, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP12]], i64 [[INDVARS_IV_NEXT59_1]] | ||
; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 8, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP19:%.*]] = fmul fast float [[TMP18]], [[TMP16]] | ||
; CHECK-NEXT: [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]] | ||
; CHECK-NEXT: store float [[TMP20]], ptr [[TMP13]], align 8, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX_2]], 2 | ||
; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX_3]], 1 | ||
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]] | ||
; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]] | ||
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 [[TMP21]], i64 [[INDVARS_IV_NEXT59_2]] | ||
; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP27]], [[TMP25]] | ||
; CHECK-NEXT: [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP23]] | ||
; CHECK-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP1]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[TMP3]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP32:%.*]] = load <4 x float>, ptr [[TMP5]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP32]], [[TMP31]] | ||
; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <4 x float> [[TMP33]], [[TMP30]] | ||
; CHECK-NEXT: store <4 x float> [[TMP34]], ptr [[TMP1]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX_3]], 2 | ||
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 256 | ||
; CHECK-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP7]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] | ||
; CHECK: for.cond.cleanup3: | ||
; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #[[ATTR2:[0-9]+]] | ||
; CHECK-NEXT: [[INC37]] = add nuw nsw i32 [[NL_056]], 1 | ||
; CHECK-NEXT: [[EXITCOND62_NOT:%.*]] = icmp eq i32 [[INC37]], 39000 | ||
; CHECK-NEXT: br i1 [[EXITCOND62_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP13:![0-9]+]] | ||
; CHECK: for.cond.cleanup7: | ||
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV58]] | ||
; CHECK-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[ARRAYIDX24]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP37:%.*]] = load <4 x float>, ptr [[ARRAYIDX26]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[ARRAYIDX28]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <4 x float> [[TMP38]], [[TMP37]] | ||
; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <4 x float> [[TMP39]], [[TMP36]] | ||
; CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[ARRAYIDX32]], align 16, !tbaa [[TBAA6]] | ||
; CHECK-NEXT: [[EXITCOND61_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT59_3]], 256 | ||
; CHECK-NEXT: br i1 [[EXITCOND61_NOT_3]], label [[FOR_COND_CLEANUP3]], label [[VECTOR_PH]], !llvm.loop [[LOOP15:![0-9]+]] | ||
; | ||
entry: | ||
br label %for.cond1.preheader | ||
|
||
for.cond1.preheader: ; preds = %entry, %for.cond.cleanup3 | ||
%nl.056 = phi i32 [ 0, %entry ], [ %inc37, %for.cond.cleanup3 ] | ||
br label %for.cond5.preheader | ||
|
||
for.cond.cleanup: ; preds = %for.cond.cleanup3 | ||
ret float undef | ||
|
||
for.cond5.preheader: ; preds = %for.cond1.preheader, %for.cond.cleanup7 | ||
%indvars.iv58 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next59, %for.cond.cleanup7 ] | ||
br label %for.body8 | ||
|
||
for.cond.cleanup3: ; preds = %for.cond.cleanup7 | ||
%call = tail call i32 @dummy(ptr noundef nonnull @a, ptr noundef nonnull @b, ptr noundef nonnull @c, ptr noundef nonnull @d, ptr noundef nonnull @e, ptr noundef nonnull @aa, ptr noundef nonnull @bb, ptr noundef nonnull @cc, float noundef nofpclass(nan inf) 0.000000e+00) #2 | ||
%inc37 = add nuw nsw i32 %nl.056, 1 | ||
%exitcond62.not = icmp eq i32 %inc37, 39000 | ||
br i1 %exitcond62.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !6 | ||
|
||
for.cond.cleanup7: ; preds = %for.body8 | ||
%arrayidx24 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv58 | ||
%0 = load float, ptr %arrayidx24, align 4, !tbaa !8 | ||
%arrayidx26 = getelementptr inbounds [32000 x float], ptr @c, i64 0, i64 %indvars.iv58 | ||
%1 = load float, ptr %arrayidx26, align 4, !tbaa !8 | ||
%arrayidx28 = getelementptr inbounds [32000 x float], ptr @d, i64 0, i64 %indvars.iv58 | ||
%2 = load float, ptr %arrayidx28, align 4, !tbaa !8 | ||
%mul29 = fmul fast float %2, %1 | ||
%add30 = fadd fast float %mul29, %0 | ||
%arrayidx32 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv58 | ||
store float %add30, ptr %arrayidx32, align 4, !tbaa !8 | ||
%indvars.iv.next59 = add nuw nsw i64 %indvars.iv58, 1 | ||
%exitcond61.not = icmp eq i64 %indvars.iv.next59, 256 | ||
br i1 %exitcond61.not, label %for.cond.cleanup3, label %for.cond5.preheader, !llvm.loop !12 | ||
|
||
for.body8: ; preds = %for.cond5.preheader, %for.body8 | ||
%indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ] | ||
%arrayidx10 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv, i64 %indvars.iv58 | ||
%3 = load float, ptr %arrayidx10, align 4, !tbaa !8 | ||
%arrayidx14 = getelementptr inbounds [256 x [256 x float]], ptr @bb, i64 0, i64 %indvars.iv, i64 %indvars.iv58 | ||
%4 = load float, ptr %arrayidx14, align 4, !tbaa !8 | ||
%arrayidx18 = getelementptr inbounds [256 x [256 x float]], ptr @cc, i64 0, i64 %indvars.iv, i64 %indvars.iv58 | ||
%5 = load float, ptr %arrayidx18, align 4, !tbaa !8 | ||
%mul = fmul fast float %5, %4 | ||
%add = fadd fast float %mul, %3 | ||
store float %add, ptr %arrayidx10, align 4, !tbaa !8 | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
%exitcond.not = icmp eq i64 %indvars.iv.next, 256 | ||
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8, !llvm.loop !14 | ||
} | ||
|
||
declare i32 @dummy(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef, float noundef nofpclass(nan inf)) local_unnamed_addr #1 | ||
|
||
attributes #0 = { nounwind uwtable vscale_range(1,16) "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" } | ||
attributes #1 = { "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v2" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+outline-atomics,+pauth,+rand,+ras,+rcpc,+rdm,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" "unsafe-fp-math"="true" } | ||
attributes #2 = { nounwind } | ||
|
||
!llvm.module.flags = !{!0, !1, !2, !3, !4} | ||
!llvm.ident = !{!5} | ||
|
||
!0 = !{i32 1, !"wchar_size", i32 4} | ||
!1 = !{i32 8, !"PIC Level", i32 2} | ||
!2 = !{i32 7, !"PIE Level", i32 2} | ||
!3 = !{i32 7, !"uwtable", i32 2} | ||
!4 = !{i32 7, !"frame-pointer", i32 1} | ||
!5 = !{!"clang version 19.0.0git ([email protected]:sjoerdmeijer/llvm-project.git 6efcff18dfc42038bafa67091e990b9c1b839a71)"} | ||
!6 = distinct !{!6, !7} | ||
!7 = !{!"llvm.loop.mustprogress"} | ||
!8 = !{!9, !9, i64 0} | ||
!9 = !{!"float", !10, i64 0} | ||
!10 = !{!"omnipotent char", !11, i64 0} | ||
!11 = !{!"Simple C/C++ TBAA"} | ||
!12 = distinct !{!12, !7, !13} | ||
!13 = !{!"llvm.loop.unroll_and_jam.count", i32 4} | ||
!14 = distinct !{!14} | ||
Comment on lines
+153
to
+174
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can simplify the test by getting rid of unnecessary attributes and metadata |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove triple