Skip to content

[LV] Add initial support for vectorizing literal struct return values #109833

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1473,6 +1473,12 @@ class TargetTransformInfo {
TTI::TargetCostKind CostKind,
unsigned Index = -1) const;

/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
/// provision the cost of vectorization/scalarization in vectorizer passes.
InstructionCost getInsertExtractValueCost(unsigned Opcode,
TTI::TargetCostKind CostKind) const;

/// \return The cost of replication shuffle of \p VF elements typed \p EltTy
/// \p ReplicationFactor times.
///
Expand Down Expand Up @@ -2223,6 +2229,9 @@ class TargetTransformInfo::Concept {
const APInt &DemandedDstElts,
TTI::TargetCostKind CostKind) = 0;

virtual InstructionCost
getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0;

virtual InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace, TTI::TargetCostKind CostKind,
Expand Down Expand Up @@ -2950,6 +2959,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
DemandedDstElts, CostKind);
}
// Forwards the insertvalue/extractvalue cost query to the wrapped
// implementation object (`Impl`) and returns its answer unchanged.
InstructionCost
getInsertExtractValueCost(unsigned Opcode,
TTI::TargetCostKind CostKind) override {
return Impl.getInsertExtractValueCost(Opcode, CostKind);
}
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
Expand Down
15 changes: 14 additions & 1 deletion llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,17 @@ class TargetTransformInfoImplBase {
return 1;
}

/// \return The default cost of an `insertvalue` or `extractvalue`
/// instruction identified by \p Opcode for the given \p CostKind.
///
/// `extractvalue` is always free. `insertvalue` costs TTI::TCC_Basic for
/// every cost kind except TTI::TCK_RecipThroughput, where it is free.
InstructionCost
getInsertExtractValueCost(unsigned Opcode,
                          TTI::TargetCostKind CostKind) const {
  // Note: The `insertvalue` cost here is chosen to match the default case of
  // getInstructionCost() -- as prior to adding this helper `insertvalue` was
  // not handled.
  //
  // A negative cost must never be returned from here: the public
  // TargetTransformInfo::getInsertExtractValueCost() wrapper asserts that the
  // cost it receives is non-negative, so the reciprocal-throughput case
  // reports TCC_Free rather than -1.
  if (Opcode == Instruction::InsertValue &&
      CostKind != TTI::TCK_RecipThroughput)
    return TTI::TCC_Basic;
  return TTI::TCC_Free;
}

InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
Expand Down Expand Up @@ -1306,9 +1317,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
case Instruction::PHI:
case Instruction::Switch:
return TargetTTI->getCFInstrCost(Opcode, CostKind, I);
case Instruction::ExtractValue:
case Instruction::Freeze:
return TTI::TCC_Free;
case Instruction::ExtractValue:
case Instruction::InsertValue:
return TargetTTI->getInsertExtractValueCost(Opcode, CostKind);
case Instruction::Alloca:
if (cast<AllocaInst>(U)->isStaticAlloca())
return TTI::TCC_Free;
Expand Down
10 changes: 0 additions & 10 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -416,10 +416,6 @@ class LoopVectorizationLegality {
/// has a vectorized variant available.
bool hasVectorCallVariants() const { return VecCallVariantsFound; }

/// Returns true if there is at least one function call in the loop which
/// returns a struct type and needs to be vectorized.
bool hasStructVectorCall() const { return StructVecCallFound; }

unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }

Expand Down Expand Up @@ -639,12 +635,6 @@ class LoopVectorizationLegality {
/// the use of those function variants.
bool VecCallVariantsFound = false;

/// If we find a call (to be vectorized) that returns a struct type, record
/// that so we can bail out until this is supported.
/// TODO: Remove this flag once vectorizing calls with struct returns is
/// supported.
bool StructVecCallFound = false;

/// Keep track of all the countable and uncountable exiting blocks if
/// the exact backedge taken count is not computable.
SmallVector<BasicBlock *, 4> CountableExitingBlocks;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
return Cost;
}

/// Queries the target implementation for the cost of an `insertvalue` or
/// `extractvalue` instruction. \p Opcode must be one of those two opcodes,
/// and the implementation must report a non-negative cost.
InstructionCost TargetTransformInfo::getInsertExtractValueCost(
    unsigned Opcode, TTI::TargetCostKind CostKind) const {
  // Only the two aggregate opcodes are meaningful for this query.
  assert((Opcode == Instruction::ExtractValue ||
          Opcode == Instruction::InsertValue) &&
         "Expecting Opcode to be insertvalue/extractvalue.");

  InstructionCost TargetCost =
      TTIImpl->getInsertExtractValueCost(Opcode, CostKind);

  // Guard against implementations handing back a negative value.
  assert(TargetCost >= 0 && "TTI should not produce negative costs!");
  return TargetCost;
}

InstructionCost TargetTransformInfo::getReplicationShuffleCost(
Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
TTI::TargetCostKind CostKind) const {
Expand Down
13 changes: 3 additions & 10 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -954,23 +954,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (CI && !VFDatabase::getMappings(*CI).empty())
VecCallVariantsFound = true;

auto CanWidenInstructionTy = [this](Instruction const &Inst) {
auto CanWidenInstructionTy = [](Instruction const &Inst) {
Type *InstTy = Inst.getType();
if (!isa<StructType>(InstTy))
return canVectorizeTy(InstTy);

// For now, we only recognize struct values returned from calls where
// all users are extractvalue as vectorizable. All element types of the
// struct must be types that can be widened.
if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
// TODO: Remove the `StructVecCallFound` flag once vectorizing calls
// with struct returns is supported.
StructVecCallFound = true;
return true;
}

return false;
return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
all_of(Inst.users(), IsaPred<ExtractValueInst>);
};

// Check that the instruction return type is vectorizable.
Expand Down
101 changes: 59 additions & 42 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2389,7 +2389,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPLane &Lane,
VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
assert((!Instr->getType()->isAggregateType() ||
canVectorizeTy(Instr->getType())) &&
"Expected vectorizable or non-aggregate type.");

// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Expand Down Expand Up @@ -2894,10 +2896,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
return ScalarCallCost;
}

static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
return Elt;
return VectorType::get(Elt, VF);
static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
if (VF.isScalar() || !canVectorizeTy(Ty))
return Ty;
return toVectorizedTy(Ty, VF);
}

InstructionCost
Expand Down Expand Up @@ -3644,13 +3646,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
}
}

// ExtractValue instructions must be uniform, because the operands are
// known to be loop-invariant.
if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
assert(IsOutOfScope(EVI->getAggregateOperand()) &&
"Expected aggregate value to be loop invariant");
AddToWorklistIfAllowed(EVI);
continue;
if (IsOutOfScope(EVI->getAggregateOperand())) {
AddToWorklistIfAllowed(EVI);
continue;
}
// Only ExtractValue instructions where the aggregate value comes from a
// call are allowed to be non-uniform.
assert(isa<CallInst>(EVI->getAggregateOperand()) &&
"Expected aggregate value to be call return value");
}

// If there's no pointer operand, there's nothing to do.
Expand Down Expand Up @@ -4513,8 +4517,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
llvm_unreachable("unhandled recipe");
}

auto WillWiden = [&TTI, VF](Type *ScalarTy) {
Type *VectorTy = toVectorTy(ScalarTy, VF);
auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
if (!NumLegalParts)
return false;
Expand All @@ -4526,7 +4529,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
// explicitly ask TTI about the register class uses for each part.
return NumLegalParts <= VF.getKnownMinValue();
}
// Two or more parts that share a register - are vectorized.
// Two or more elements that share a register - are vectorized.
return NumLegalParts < VF.getKnownMinValue();
};

Expand All @@ -4545,7 +4548,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
if (!Visited.insert({ScalarTy}).second)
continue;
if (WillWiden(ScalarTy))
Type *WideTy = toVectorizedTy(ScalarTy, VF);
if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
return true;
}
}
Expand Down Expand Up @@ -5503,10 +5507,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(toVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
/*Extract*/ false, CostKind);
Type *WideTy = toVectorizedTy(I->getType(), VF);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be good to have some cost model tests showing what happens in different scenarios when you have call instructions in a loop that return a struct. In particular, it would be good to have tests that exercise this code path.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking if this has been addressed in the latest version?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, there's some cost model tests and a test case crafted to hit this specific code path too (@scalarized_predicated_struct_return).

for (Type *VectorTy : getContainedTypes(WideTy)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
/*Insert=*/true,
/*Extract=*/false, CostKind);
}
ScalarCost +=
VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
Expand All @@ -5517,15 +5524,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// overhead.
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get())) {
assert(VectorType::isValidElementType(J->getType()) &&
assert(canVectorizeTy(J->getType()) &&
"Instruction has non-scalar type");
if (CanBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(toVectorTy(J->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
/*Extract*/ true, CostKind);
Type *WideTy = toVectorizedTy(J->getType(), VF);
for (Type *VectorTy : getContainedTypes(WideTy)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(VectorTy),
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
/*Extract*/ true, CostKind);
}
}
}

Expand Down Expand Up @@ -6004,13 +6014,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
return 0;

InstructionCost Cost = 0;
Type *RetTy = toVectorTy(I->getType(), VF);
Type *RetTy = toVectorizedTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
/*Insert*/ true,
/*Extract*/ false, CostKind);
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {

for (Type *VectorTy : getContainedTypes(RetTy)) {
Cost += TTI.getScalarizationOverhead(
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
/*Insert=*/true,
/*Extract=*/false, CostKind);
}
}

// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
Expand Down Expand Up @@ -6268,9 +6282,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {

bool MaskRequired = Legal->isMaskRequired(CI);
// Compute corresponding vector type for return value and arguments.
Type *RetTy = toVectorTy(ScalarRetTy, VF);
Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
Tys.push_back(toVectorTy(ScalarTy, VF));
Tys.push_back(toVectorizedTy(ScalarTy, VF));

// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
Expand Down Expand Up @@ -6460,7 +6474,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
HasSingleCopyAfterVectorization(I, VF));
VectorTy = RetTy;
} else
VectorTy = toVectorTy(RetTy, VF);
VectorTy = toVectorizedTy(RetTy, VF);

if (VF.isVector() && VectorTy->isVectorTy() &&
!TTI.getNumberOfParts(VectorTy))
Expand Down Expand Up @@ -8582,7 +8596,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
case Instruction::Shl:
case Instruction::Sub:
case Instruction::Xor:
case Instruction::Freeze:
case Instruction::Freeze: {
SmallVector<VPValue *> NewOps(Operands);
if (Instruction::isBinaryOp(I->getOpcode())) {
// The legacy cost model uses SCEV to check if some of the operands are
Expand All @@ -8607,6 +8621,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
NewOps[1] = GetConstantViaSCEV(NewOps[1]);
}
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
}
case Instruction::ExtractValue: {
SmallVector<VPValue *> NewOps(Operands);
Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
auto *EVI = cast<ExtractValueInst>(I);
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
unsigned Idx = EVI->getIndices()[0];
NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
}
};
}

Expand Down Expand Up @@ -9888,7 +9912,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
VectorType::get(UI->getType(), State.VF));
State.set(this, Poison);
}
State.packScalarIntoVectorValue(this, *State.Lane);
State.packScalarIntoVectorizedValue(this, *State.Lane);
}
return;
}
Expand Down Expand Up @@ -10405,13 +10429,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}

if (LVL.hasStructVectorCall()) {
reportVectorizationFailure("Auto-vectorization of calls that return struct "
"types is not yet supported",
"StructCallVectorizationUnsupported", ORE, L);
return false;
}

// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
Expand Down
27 changes: 19 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
} else {
// Initialize packing with insertelements to start from undef.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want, you could potentially pull out some of these changes into a NFC patch. For example, toVectorTy -> toVectorizedTy and VectorType::get -> toVectorizedTy, etc. Although if you don't think it's worth it I'm happy as it is.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think splitting those out would make sense. Within this patch, all of these changes are needed for the core functionality to work (switching the call does change behavior). If I landed these as an NFC, it would make it less obvious that these are required changes for the struct vectorization.

set(Def, Undef);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(Def, Lane);
packScalarIntoVectorizedValue(Def, Lane);
VectorValue = get(Def);
}
Builder.restoreIP(OldIP);
Expand Down Expand Up @@ -392,13 +392,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
Builder.SetCurrentDebugLocation(DIL);
}

void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
const VPLane &Lane) {
void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def,
const VPLane &Lane) {
Value *ScalarInst = get(Def, Lane);
Value *VectorValue = get(Def);
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
Lane.getAsRuntimeExpr(Builder, VF));
set(Def, VectorValue);
Value *WideValue = get(Def);
Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
// We must handle each element of a vectorized struct type.
for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I);
Value *VectorValue = Builder.CreateExtractValue(WideValue, I);
VectorValue =
Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr);
WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I);
}
} else {
WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr);
}
set(Def, WideValue);
}

BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
Expand Down
Loading