Skip to content

Commit e6eb84a

Browse files
committed
[LoopVectorize] Use DataLayout::getIndexType instead of i32 for non-constant GEP indices.
This is specifically relevant for loops that vectorize using a scalable VF, where the code results in:

    %vscale = call i32 @llvm.vscale.i32()
    %vf.part1 = mul i32 %vscale, 4
    %gep = getelementptr ..., i32 %vf.part1

Which InstCombine then changes into:

    %vscale = call i32 @llvm.vscale.i32()
    %vf.part1 = mul i32 %vscale, 4
    %vf.part1.sext = sext i32 %vf.part1 to i64
    %gep = getelementptr ..., i64 %vf.part1.sext

D143016 tried to remove these extends, but that only works when the call to llvm.vscale.i32() has a single use. After doing any kind of CSE on these calls the combine no longer kicks in. It seems more sensible to ask DataLayout what type to use, rather than relying on InstCombine to insert the extend and hoping it can fold it away. I've only changed this for indices that are not constant, because I vaguely remember there was a reason for sticking with i32. It would also mean patching up loads more tests. Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D143267
1 parent ce6de98 commit e6eb84a

18 files changed

+349
-356
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9736,6 +9736,13 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
97369736
// Calculate the pointer for the specific unroll-part.
97379737
GetElementPtrInst *PartPtr = nullptr;
97389738

9739+
// Use i32 for the gep index type when the value is constant,
9740+
// or query DataLayout for a more suitable index type otherwise.
9741+
const DataLayout &DL =
9742+
Builder.GetInsertBlock()->getModule()->getDataLayout();
9743+
Type *IndexTy = State.VF.isScalable() && (Reverse || Part > 0)
9744+
? DL.getIndexType(ScalarDataTy->getPointerTo())
9745+
: Builder.getInt32Ty();
97399746
bool InBounds = false;
97409747
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
97419748
InBounds = gep->isInBounds();
@@ -9744,11 +9751,13 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
97449751
// wide store needs to start at the last vector element.
97459752
// RunTimeVF = VScale * VF.getKnownMinValue()
97469753
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9747-
Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9754+
Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
97489755
// NumElt = -Part * RunTimeVF
9749-
Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9756+
Value *NumElt =
9757+
Builder.CreateMul(ConstantInt::get(IndexTy, -Part), RunTimeVF);
97509758
// LastLane = 1 - RunTimeVF
9751-
Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9759+
Value *LastLane =
9760+
Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
97529761
PartPtr =
97539762
cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
97549763
PartPtr->setIsInBounds(InBounds);
@@ -9759,8 +9768,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
97599768
BlockInMaskParts[Part] =
97609769
Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
97619770
} else {
9762-
Value *Increment =
9763-
createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9771+
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
97649772
PartPtr = cast<GetElementPtrInst>(
97659773
Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
97669774
PartPtr->setIsInBounds(InBounds);

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
3434
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
3535
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
3636
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
37-
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
38-
; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 2
39-
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 [[TMP14]]
37+
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
38+
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
39+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
4040
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
4141
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
4242
; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
@@ -72,7 +72,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
7272
; CHECK-NEXT: [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]]
7373
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
7474
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
75-
; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
75+
; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
7676
; CHECK: vec.epilog.middle.block:
7777
; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]
7878
; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
3434
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
3535
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
3636
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
37-
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
38-
; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 2
39-
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 [[TMP14]]
37+
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
38+
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
39+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
4040
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
4141
; CHECK-NEXT: [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
4242
; CHECK-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
@@ -71,7 +71,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
7171
; CHECK-NEXT: [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
7272
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
7373
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
74-
; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
74+
; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
7575
; CHECK: vec.epilog.middle.block:
7676
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP26]])
7777
; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
3333
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
3434
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
3535
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
36-
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
37-
; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
38-
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP14]]
36+
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
37+
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
38+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]]
3939
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
4040
; CHECK-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
4141
; CHECK-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP16]], <vscale x 4 x float> [[WIDE_LOAD2]])
@@ -67,7 +67,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
6767
; CHECK-NEXT: [[TMP24]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI7]], <2 x float> [[WIDE_LOAD8]])
6868
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
6969
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
70-
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
70+
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
7171
; CHECK: vec.epilog.middle.block:
7272
; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
7373
; CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]

0 commit comments

Comments
 (0)