Skip to content

Commit 9c3f5fe

Browse files
committed
[LV] Don't consider the latch block as ScalarPredicatedBB.
The conditional branch from the loop latch will be replaced by a single branch controlling the loop, so there is no extra overhead from scalarization. This improves the cost estimates in some cases.
1 parent a1e9608 commit 9c3f5fe

File tree

3 files changed

+111
-22
lines changed

3 files changed

+111
-22
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6876,11 +6876,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
68766876
// In cases of scalarized and predicated instructions, there will be VF
68776877
// predicated blocks in the vectorized loop. Each branch around these
68786878
// blocks requires also an extract of its vector compare i1 element.
6879+
// Note that the conditional branch from the loop latch will be replaced by
6880+
// a single branch controlling the loop, so there is no extra overhead from
6881+
// scalarization.
68796882
bool ScalarPredicatedBB = false;
68806883
BranchInst *BI = cast<BranchInst>(I);
68816884
if (VF.isVector() && BI->isConditional() &&
68826885
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6883-
PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6886+
PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
6887+
BI->getParent() != TheLoop->getLoopLatch())
68846888
ScalarPredicatedBB = true;
68856889

68866890
if (ScalarPredicatedBB) {

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -763,54 +763,86 @@ define void @latch_branch_cost(ptr %dst) {
763763
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
764764
; PRED: vector.body:
765765
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
766-
; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
767-
; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99>
768-
; PRED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
766+
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
767+
; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99>
768+
; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
769769
; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
770770
; PRED: pred.store.if:
771771
; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
772772
; PRED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
773773
; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1
774774
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
775775
; PRED: pred.store.continue:
776-
; PRED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
776+
; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
777777
; PRED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
778778
; PRED: pred.store.if1:
779779
; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
780780
; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
781781
; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1
782782
; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
783783
; PRED: pred.store.continue2:
784-
; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
784+
; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
785785
; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
786786
; PRED: pred.store.if3:
787787
; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
788788
; PRED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
789789
; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1
790790
; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
791791
; PRED: pred.store.continue4:
792-
; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
793-
; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
792+
; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
793+
; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
794794
; PRED: pred.store.if5:
795795
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
796796
; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
797797
; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1
798-
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
798+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
799799
; PRED: pred.store.continue6:
800-
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
801-
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
802-
; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
803-
; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
800+
; PRED-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
801+
; PRED-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
802+
; PRED: pred.store.if7:
803+
; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
804+
; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
805+
; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1
806+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]]
807+
; PRED: pred.store.continue8:
808+
; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
809+
; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
810+
; PRED: pred.store.if9:
811+
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
812+
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
813+
; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1
814+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE10]]
815+
; PRED: pred.store.continue10:
816+
; PRED-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
817+
; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
818+
; PRED: pred.store.if11:
819+
; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
820+
; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]]
821+
; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1
822+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]]
823+
; PRED: pred.store.continue12:
824+
; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
825+
; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]]
826+
; PRED: pred.store.if13:
827+
; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
828+
; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
829+
; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1
830+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
831+
; PRED: pred.store.continue14:
832+
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
833+
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
834+
; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
835+
; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
804836
; PRED: middle.block:
805837
; PRED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
806838
; PRED: scalar.ph:
807-
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
839+
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
808840
; PRED-NEXT: br label [[FOR_BODY:%.*]]
809841
; PRED: loop:
810-
; PRED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
811-
; PRED-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
812-
; PRED-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
813-
; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
842+
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
843+
; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
844+
; PRED-NEXT: store i8 0, ptr [[GEP]], align 1
845+
; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1
814846
; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
815847
; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
816848
; PRED: exit:

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -856,16 +856,67 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
856856
; PRED-LABEL: define void @exit_cond_zext_iv(
857857
; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
858858
; PRED-NEXT: entry:
859+
; PRED-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
860+
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
861+
; PRED: vector.scevcheck:
862+
; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
863+
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
864+
; PRED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
865+
; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
866+
; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]]
867+
; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
868+
; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
869+
; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
870+
; PRED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
871+
; PRED: vector.ph:
872+
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
873+
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
874+
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
875+
; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
876+
; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
877+
; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
878+
; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
859879
; PRED-NEXT: br label [[LOOP:%.*]]
860-
; PRED: loop:
861-
; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
862-
; PRED-NEXT: [[IV_CONV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_EXT:%.*]], [[LOOP]] ]
880+
; PRED: vector.body:
881+
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
882+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
883+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
884+
; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
885+
; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
886+
; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
887+
; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
888+
; PRED: pred.store.if:
889+
; PRED-NEXT: [[IV_CONV:%.*]] = add i64 [[INDEX]], 0
863890
; PRED-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
864891
; PRED-NEXT: store i32 0, ptr [[GEP]], align 8
892+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
893+
; PRED: pred.store.continue:
894+
; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
895+
; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
896+
; PRED: pred.store.if5:
897+
; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1
898+
; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2
899+
; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8
900+
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
901+
; PRED: pred.store.continue6:
902+
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
903+
; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
904+
; PRED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
905+
; PRED: middle.block:
906+
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
907+
; PRED: scalar.ph:
908+
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
909+
; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
910+
; PRED-NEXT: br label [[LOOP1:%.*]]
911+
; PRED: loop:
912+
; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
913+
; PRED-NEXT: [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
914+
; PRED-NEXT: [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2
915+
; PRED-NEXT: store i32 0, ptr [[GEP1]], align 8
865916
; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
866917
; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
867918
; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
868-
; PRED-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]]
919+
; PRED-NEXT: br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
869920
; PRED: exit:
870921
; PRED-NEXT: ret void
871922
;
@@ -913,4 +964,6 @@ attributes #0 = { "target-features"="+sve" }
913964
; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
914965
; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
915966
; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
967+
; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
968+
; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
916969
;.

0 commit comments

Comments
 (0)