Skip to content

Commit e0e67a6

Browse files
authored
[LV] Add initial support for vectorizing literal struct return values (#109833)
This patch adds initial support for vectorizing literal struct return values. Currently, this is limited to the case where the struct is homogeneous (all elements have the same type) and not packed. All users of the call must also be `extractvalue` instructions. The intended use case for this is vectorizing intrinsics such as: ``` declare { float, float } @llvm.sincos.f32(float %x) ``` Mapping them to structure-returning library calls such as: ``` declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>) ``` Or their widened form (such as `@llvm.sincos.v4f32` in this case). Implementing this required two main changes: 1. Supporting widening `extractvalue` 2. Adding support for vectorized struct types in LV * This is mostly limited to parts of the cost model and scalarization Since the supported use case is narrow, the required changes are relatively small.
1 parent 262e4c1 commit e0e67a6

14 files changed

+580
-103
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

+14
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,12 @@ class TargetTransformInfo {
14731473
TTI::TargetCostKind CostKind,
14741474
unsigned Index = -1) const;
14751475

1476+
/// \return The expected cost of aggregate inserts and extracts. This is
1477+
/// used when the instruction is not available; a typical use case is to
1478+
/// provision the cost of vectorization/scalarization in vectorizer passes.
1479+
InstructionCost getInsertExtractValueCost(unsigned Opcode,
1480+
TTI::TargetCostKind CostKind) const;
1481+
14761482
/// \return The cost of replication shuffle of \p VF elements typed \p EltTy
14771483
/// \p ReplicationFactor times.
14781484
///
@@ -2223,6 +2229,9 @@ class TargetTransformInfo::Concept {
22232229
const APInt &DemandedDstElts,
22242230
TTI::TargetCostKind CostKind) = 0;
22252231

2232+
virtual InstructionCost
2233+
getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0;
2234+
22262235
virtual InstructionCost
22272236
getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
22282237
unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -2950,6 +2959,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
29502959
return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
29512960
DemandedDstElts, CostKind);
29522961
}
2962+
InstructionCost
2963+
getInsertExtractValueCost(unsigned Opcode,
2964+
TTI::TargetCostKind CostKind) override {
2965+
return Impl.getInsertExtractValueCost(Opcode, CostKind);
2966+
}
29532967
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
29542968
unsigned AddressSpace,
29552969
TTI::TargetCostKind CostKind,

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+14-1
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,17 @@ class TargetTransformInfoImplBase {
745745
return 1;
746746
}
747747

748+
InstructionCost
749+
getInsertExtractValueCost(unsigned Opcode,
750+
TTI::TargetCostKind CostKind) const {
751+
// Note: The `insertvalue` cost here is chosen to match the default case of
752+
// getInstructionCost() -- as prior to adding this helper `insertvalue` was
753+
// not handled.
754+
if (Opcode == Instruction::InsertValue)
755+
return CostKind == TTI::TCK_RecipThroughput ? -1 : TTI::TCC_Basic;
756+
return TTI::TCC_Free;
757+
}
758+
748759
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
749760
unsigned AddressSpace,
750761
TTI::TargetCostKind CostKind,
@@ -1306,9 +1317,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
13061317
case Instruction::PHI:
13071318
case Instruction::Switch:
13081319
return TargetTTI->getCFInstrCost(Opcode, CostKind, I);
1309-
case Instruction::ExtractValue:
13101320
case Instruction::Freeze:
13111321
return TTI::TCC_Free;
1322+
case Instruction::ExtractValue:
1323+
case Instruction::InsertValue:
1324+
return TargetTTI->getInsertExtractValueCost(Opcode, CostKind);
13121325
case Instruction::Alloca:
13131326
if (cast<AllocaInst>(U)->isStaticAlloca())
13141327
return TTI::TCC_Free;

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

-10
Original file line numberDiff line numberDiff line change
@@ -416,10 +416,6 @@ class LoopVectorizationLegality {
416416
/// has a vectorized variant available.
417417
bool hasVectorCallVariants() const { return VecCallVariantsFound; }
418418

419-
/// Returns true if there is at least one function call in the loop which
420-
/// returns a struct type and needs to be vectorized.
421-
bool hasStructVectorCall() const { return StructVecCallFound; }
422-
423419
unsigned getNumStores() const { return LAI->getNumStores(); }
424420
unsigned getNumLoads() const { return LAI->getNumLoads(); }
425421

@@ -639,12 +635,6 @@ class LoopVectorizationLegality {
639635
/// the use of those function variants.
640636
bool VecCallVariantsFound = false;
641637

642-
/// If we find a call (to be vectorized) that returns a struct type, record
643-
/// that so we can bail out until this is supported.
644-
/// TODO: Remove this flag once vectorizing calls with struct returns is
645-
/// supported.
646-
bool StructVecCallFound = false;
647-
648638
/// Keep track of all the countable and uncountable exiting blocks if
649639
/// the exact backedge taken count is not computable.
650640
SmallVector<BasicBlock *, 4> CountableExitingBlocks;

llvm/lib/Analysis/TargetTransformInfo.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -1113,6 +1113,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
11131113
return Cost;
11141114
}
11151115

1116+
InstructionCost TargetTransformInfo::getInsertExtractValueCost(
1117+
unsigned Opcode, TTI::TargetCostKind CostKind) const {
1118+
assert((Opcode == Instruction::InsertValue ||
1119+
Opcode == Instruction::ExtractValue) &&
1120+
"Expecting Opcode to be insertvalue/extractvalue.");
1121+
InstructionCost Cost = TTIImpl->getInsertExtractValueCost(Opcode, CostKind);
1122+
assert(Cost >= 0 && "TTI should not produce negative costs!");
1123+
return Cost;
1124+
}
1125+
11161126
InstructionCost TargetTransformInfo::getReplicationShuffleCost(
11171127
Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
11181128
TTI::TargetCostKind CostKind) const {

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

+3-10
Original file line numberDiff line numberDiff line change
@@ -954,23 +954,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
954954
if (CI && !VFDatabase::getMappings(*CI).empty())
955955
VecCallVariantsFound = true;
956956

957-
auto CanWidenInstructionTy = [this](Instruction const &Inst) {
957+
auto CanWidenInstructionTy = [](Instruction const &Inst) {
958958
Type *InstTy = Inst.getType();
959959
if (!isa<StructType>(InstTy))
960960
return canVectorizeTy(InstTy);
961961

962962
// For now, we only recognize struct values returned from calls where
963963
// all users are extractvalue as vectorizable. All element types of the
964964
// struct must be types that can be widened.
965-
if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
966-
all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
967-
// TODO: Remove the `StructVecCallFound` flag once vectorizing calls
968-
// with struct returns is supported.
969-
StructVecCallFound = true;
970-
return true;
971-
}
972-
973-
return false;
965+
return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
966+
all_of(Inst.users(), IsaPred<ExtractValueInst>);
974967
};
975968

976969
// Check that the instruction return type is vectorizable.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+59-42
Original file line numberDiff line numberDiff line change
@@ -2390,7 +2390,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23902390
VPReplicateRecipe *RepRecipe,
23912391
const VPLane &Lane,
23922392
VPTransformState &State) {
2393-
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2393+
assert((!Instr->getType()->isAggregateType() ||
2394+
canVectorizeTy(Instr->getType())) &&
2395+
"Expected vectorizable or non-aggregate type.");
23942396

23952397
// Does this instruction return a value ?
23962398
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2900,10 +2902,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
29002902
return ScalarCallCost;
29012903
}
29022904

2903-
static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2904-
if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2905-
return Elt;
2906-
return VectorType::get(Elt, VF);
2905+
static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2906+
if (VF.isScalar() || !canVectorizeTy(Ty))
2907+
return Ty;
2908+
return toVectorizedTy(Ty, VF);
29072909
}
29082910

29092911
InstructionCost
@@ -3650,13 +3652,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
36503652
}
36513653
}
36523654

3653-
// ExtractValue instructions must be uniform, because the operands are
3654-
// known to be loop-invariant.
36553655
if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3656-
assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3657-
"Expected aggregate value to be loop invariant");
3658-
AddToWorklistIfAllowed(EVI);
3659-
continue;
3656+
if (IsOutOfScope(EVI->getAggregateOperand())) {
3657+
AddToWorklistIfAllowed(EVI);
3658+
continue;
3659+
}
3660+
// Only ExtractValue instructions where the aggregate value comes from a
3661+
// call are allowed to be non-uniform.
3662+
assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3663+
"Expected aggregate value to be call return value");
36603664
}
36613665

36623666
// If there's no pointer operand, there's nothing to do.
@@ -4526,8 +4530,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45264530
llvm_unreachable("unhandled recipe");
45274531
}
45284532

4529-
auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4530-
Type *VectorTy = toVectorTy(ScalarTy, VF);
4533+
auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
45314534
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
45324535
if (!NumLegalParts)
45334536
return false;
@@ -4539,7 +4542,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45394542
// explicitly ask TTI about the register class uses for each part.
45404543
return NumLegalParts <= VF.getKnownMinValue();
45414544
}
4542-
// Two or more parts that share a register - are vectorized.
4545+
// Two or more elements that share a register - are vectorized.
45434546
return NumLegalParts < VF.getKnownMinValue();
45444547
};
45454548

@@ -4558,7 +4561,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45584561
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
45594562
if (!Visited.insert({ScalarTy}).second)
45604563
continue;
4561-
if (WillWiden(ScalarTy))
4564+
Type *WideTy = toVectorizedTy(ScalarTy, VF);
4565+
if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
45624566
return true;
45634567
}
45644568
}
@@ -5515,10 +5519,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
55155519
// Compute the scalarization overhead of needed insertelement instructions
55165520
// and phi nodes.
55175521
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5518-
ScalarCost += TTI.getScalarizationOverhead(
5519-
cast<VectorType>(toVectorTy(I->getType(), VF)),
5520-
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5521-
/*Extract*/ false, CostKind);
5522+
Type *WideTy = toVectorizedTy(I->getType(), VF);
5523+
for (Type *VectorTy : getContainedTypes(WideTy)) {
5524+
ScalarCost += TTI.getScalarizationOverhead(
5525+
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5526+
/*Insert=*/true,
5527+
/*Extract=*/false, CostKind);
5528+
}
55225529
ScalarCost +=
55235530
VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
55245531
}
@@ -5529,15 +5536,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
55295536
// overhead.
55305537
for (Use &U : I->operands())
55315538
if (auto *J = dyn_cast<Instruction>(U.get())) {
5532-
assert(VectorType::isValidElementType(J->getType()) &&
5539+
assert(canVectorizeTy(J->getType()) &&
55335540
"Instruction has non-scalar type");
55345541
if (CanBeScalarized(J))
55355542
Worklist.push_back(J);
55365543
else if (needsExtract(J, VF)) {
5537-
ScalarCost += TTI.getScalarizationOverhead(
5538-
cast<VectorType>(toVectorTy(J->getType(), VF)),
5539-
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5540-
/*Extract*/ true, CostKind);
5544+
Type *WideTy = toVectorizedTy(J->getType(), VF);
5545+
for (Type *VectorTy : getContainedTypes(WideTy)) {
5546+
ScalarCost += TTI.getScalarizationOverhead(
5547+
cast<VectorType>(VectorTy),
5548+
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5549+
/*Extract*/ true, CostKind);
5550+
}
55415551
}
55425552
}
55435553

@@ -6016,13 +6026,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
60166026
return 0;
60176027

60186028
InstructionCost Cost = 0;
6019-
Type *RetTy = toVectorTy(I->getType(), VF);
6029+
Type *RetTy = toVectorizedTy(I->getType(), VF);
60206030
if (!RetTy->isVoidTy() &&
6021-
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6022-
Cost += TTI.getScalarizationOverhead(
6023-
cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6024-
/*Insert*/ true,
6025-
/*Extract*/ false, CostKind);
6031+
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
6032+
6033+
for (Type *VectorTy : getContainedTypes(RetTy)) {
6034+
Cost += TTI.getScalarizationOverhead(
6035+
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
6036+
/*Insert=*/true,
6037+
/*Extract=*/false, CostKind);
6038+
}
6039+
}
60266040

60276041
// Some targets keep addresses scalar.
60286042
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6280,9 +6294,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
62806294

62816295
bool MaskRequired = Legal->isMaskRequired(CI);
62826296
// Compute corresponding vector type for return value and arguments.
6283-
Type *RetTy = toVectorTy(ScalarRetTy, VF);
6297+
Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
62846298
for (Type *ScalarTy : ScalarTys)
6285-
Tys.push_back(toVectorTy(ScalarTy, VF));
6299+
Tys.push_back(toVectorizedTy(ScalarTy, VF));
62866300

62876301
// An in-loop reduction using an fmuladd intrinsic is a special case;
62886302
// we don't want the normal cost for that intrinsic.
@@ -6459,7 +6473,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
64596473
HasSingleCopyAfterVectorization(I, VF));
64606474
VectorTy = RetTy;
64616475
} else
6462-
VectorTy = toVectorTy(RetTy, VF);
6476+
VectorTy = toVectorizedTy(RetTy, VF);
64636477

64646478
if (VF.isVector() && VectorTy->isVectorTy() &&
64656479
!TTI.getNumberOfParts(VectorTy))
@@ -8601,7 +8615,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
86018615
case Instruction::Shl:
86028616
case Instruction::Sub:
86038617
case Instruction::Xor:
8604-
case Instruction::Freeze:
8618+
case Instruction::Freeze: {
86058619
SmallVector<VPValue *> NewOps(Operands);
86068620
if (Instruction::isBinaryOp(I->getOpcode())) {
86078621
// The legacy cost model uses SCEV to check if some of the operands are
@@ -8626,6 +8640,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
86268640
NewOps[1] = GetConstantViaSCEV(NewOps[1]);
86278641
}
86288642
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8643+
}
8644+
case Instruction::ExtractValue: {
8645+
SmallVector<VPValue *> NewOps(Operands);
8646+
Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
8647+
auto *EVI = cast<ExtractValueInst>(I);
8648+
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
8649+
unsigned Idx = EVI->getIndices()[0];
8650+
NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
8651+
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8652+
}
86298653
};
86308654
}
86318655

@@ -9928,7 +9952,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
99289952
VectorType::get(UI->getType(), State.VF));
99299953
State.set(this, Poison);
99309954
}
9931-
State.packScalarIntoVectorValue(this, *State.Lane);
9955+
State.packScalarIntoVectorizedValue(this, *State.Lane);
99329956
}
99339957
return;
99349958
}
@@ -10445,13 +10469,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1044510469
return false;
1044610470
}
1044710471

10448-
if (LVL.hasStructVectorCall()) {
10449-
reportVectorizationFailure("Auto-vectorization of calls that return struct "
10450-
"types is not yet supported",
10451-
"StructCallVectorizationUnsupported", ORE, L);
10452-
return false;
10453-
}
10454-
1045510472
// Entrance to the VPlan-native vectorization path. Outer loops are processed
1045610473
// here. They may require CFG and instruction level transformations before
1045710474
// even evaluating whether vectorization is profitable. Since we cannot modify

llvm/lib/Transforms/Vectorize/VPlan.cpp

+19-8
Original file line numberDiff line numberDiff line change
@@ -336,10 +336,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
336336
} else {
337337
// Initialize packing with insertelements to start from undef.
338338
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
339-
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
339+
Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
340340
set(Def, Undef);
341341
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
342-
packScalarIntoVectorValue(Def, Lane);
342+
packScalarIntoVectorizedValue(Def, Lane);
343343
VectorValue = get(Def);
344344
}
345345
Builder.restoreIP(OldIP);
@@ -392,13 +392,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
392392
Builder.SetCurrentDebugLocation(DIL);
393393
}
394394

395-
void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
396-
const VPLane &Lane) {
395+
void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def,
396+
const VPLane &Lane) {
397397
Value *ScalarInst = get(Def, Lane);
398-
Value *VectorValue = get(Def);
399-
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
400-
Lane.getAsRuntimeExpr(Builder, VF));
401-
set(Def, VectorValue);
398+
Value *WideValue = get(Def);
399+
Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
400+
if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
401+
// We must handle each element of a vectorized struct type.
402+
for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
403+
Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I);
404+
Value *VectorValue = Builder.CreateExtractValue(WideValue, I);
405+
VectorValue =
406+
Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr);
407+
WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I);
408+
}
409+
} else {
410+
WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr);
411+
}
412+
set(Def, WideValue);
402413
}
403414

404415
BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {

0 commit comments

Comments
 (0)