-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[SLP]Support LShr as base for copyable elements #153393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Support LShr as base for copyable elements #153393
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesAdded support for LShr instructions as base for copyable elements. Also, Full diff: https://github.com/llvm/llvm-project/pull/153393.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3045eeb3eb48e..f71faa2e2a7d5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10571,27 +10571,29 @@ class InstructionsCompatibilityAnalysis {
BasicBlock *Parent = nullptr;
// Checks if the instruction has supported opcode.
auto IsSupportedOpcode = [&](Instruction *I) {
- return I && I->getOpcode() == Instruction::Add &&
+ return I &&
+ (I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::LShr) &&
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
};
// Exclude operands instructions immediately to improve compile time, it
// will be unable to schedule anyway.
SmallDenseSet<Value *, 8> Operands;
+ SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
for (Value *V : VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
if (!DT.isReachableFromEntry(I->getParent()))
continue;
- if (!MainOp) {
- MainOp = I;
+ if (Candidates.empty()) {
+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
Parent = I->getParent();
Operands.insert(I->op_begin(), I->op_end());
continue;
}
if (Parent == I->getParent()) {
- if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
- MainOp = I;
+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
Operands.insert(I->op_begin(), I->op_end());
continue;
}
@@ -10603,24 +10605,37 @@ class InstructionsCompatibilityAnalysis {
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
- MainOp = I;
+ Candidates.clear();
+ Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
Parent = I->getParent();
Operands.clear();
Operands.insert(I->op_begin(), I->op_end());
}
}
- if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
- MainOp = nullptr;
- return;
+ unsigned BestOpcodeNum = 0;
+ MainOp = nullptr;
+ for (const auto &P : Candidates) {
+ if (P.second.size() < BestOpcodeNum)
+ continue;
+ for (Instruction *I : P.second) {
+ if (IsSupportedOpcode(I) && !Operands.contains(I)) {
+ MainOp = I;
+ BestOpcodeNum = P.second.size();
+ break;
+ }
+ }
}
- MainOpcode = MainOp->getOpcode();
+ if (MainOp)
+ MainOpcode = MainOp->getOpcode();
}
/// Returns the idempotent value for the \p MainOp with the detected \p
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
/// the operand itself, since V or V == V.
Value *selectBestIdempotentValue() const {
- assert(MainOpcode == Instruction::Add && "Unsupported opcode");
+ assert(
+ (MainOpcode == Instruction::Add || MainOpcode == Instruction::LShr) &&
+ "Unsupported opcode");
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
!MainOp->isCommutative());
}
@@ -10635,6 +10650,7 @@ class InstructionsCompatibilityAnalysis {
return convertTo(cast<Instruction>(V), S).second;
switch (MainOpcode) {
case Instruction::Add:
+ case Instruction::LShr:
return {V, selectBestIdempotentValue()};
default:
break;
@@ -10852,6 +10868,21 @@ class InstructionsCompatibilityAnalysis {
}
if (!Res)
return InstructionsState::invalid();
+ constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+ InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
+ InstructionCost VectorCost;
+ FixedVectorType *VecTy =
+ getWidenedType(S.getMainOp()->getType(), VL.size());
+ switch (MainOpcode) {
+ case Instruction::Add:
+ case Instruction::LShr:
+ VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
+ break;
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ if (VectorCost > ScalarCost)
+ return InstructionsState::invalid();
return S;
}
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21064,6 +21095,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
const auto *It = find(Op, CD->getInst());
assert(It != Op.end() && "Lane not set");
+ SmallPtrSet<Instruction *, 4> Visited;
do {
int Lane = std::distance(Op.begin(), It);
assert(Lane >= 0 && "Lane not set");
@@ -21085,13 +21117,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
(InsertInReadyList && UseSD->isReady()))
WorkList.push_back(UseSD);
}
- } else if (ScheduleData *UseSD = getScheduleData(In)) {
- CD->incDependencies();
- if (!UseSD->isScheduled())
- CD->incrementUnscheduledDeps(1);
- if (!UseSD->hasValidDependencies() ||
- (InsertInReadyList && UseSD->isReady()))
- WorkList.push_back(UseSD);
+ } else if (Visited.insert(In).second) {
+ if (ScheduleData *UseSD = getScheduleData(In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
+ }
}
It = find(make_range(std::next(It), Op.end()), CD->getInst());
} while (It != Op.end());
@@ -21845,6 +21879,8 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
+ if (E.isCopyableElement(V))
+ return true;
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
index 8d44d03e0e5cc..6d961fc3378b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
@@ -8,11 +8,8 @@ define i32 @test(ptr %c) {
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 4f94784a24dd4..c02ef8388b066 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
-; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 700e3ed9effc4..0545e5403f594 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -101,10 +101,8 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
-; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[TMP5]]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
update with trunk to clear the CI error? |
Created using spr 1.3.5 [skip ci]
Created using spr 1.3.5 [skip ci]
Added support for LShr instructions as base for copyable elements. Also, added simple analysis for best base instruction selection, if multiple candidates are available. Reviewers: hiraditya, RKSimon Reviewed By: RKSimon Pull Request: llvm/llvm-project#153393
This caused a compile-time crash for zvl512b and zvl1024b RVV configurations (hopefully you got an email notification for failure on clang-riscv-rva23-zvl512b-2stage and clang-riscv-rva23-zvl1024b-2stage ?). I've confirmed this commit is the culprit via bisection. Here is an example failure on the buildbots https://lab.llvm.org/buildbot/#/builders/212/builds/335 Here is a reduced test case:
And the error:
This should have been catchable on the faster gauntlet bot but I need to add zvl512b/zvl1024b to its test matrix. |
This reverts commit ca4ebf9. Causes compile-time crashes for some inputs with RVV zvl512b/zvl1024b configurations. See here for a minimal reproducer: #153393 (comment)
I've landed a revert to get the bots green again. I'll add a zvl512b/zvl1024b config to the gauntlet bot tomorrow. |
|
We also saw a segfault building a stage2 rustc that root-caused to this, but I don't have time to try and reduce - hopefully it's related to the other report. |
This reverts commit ca4ebf9. Causes compile-time crashes for some inputs with RVV zvl512b/zvl1024b configurations. See here for a minimal reproducer: llvm/llvm-project#153393 (comment)
I can take a look |
Candidate PR: #153709 To verify, I reapplied this patch locally and confirmed that my patch could fix the issue here. |
While the DAGCombiner was wrong, I have a meta question for @alexey-bataev: is it expected for SLP to generate |
What is illegal here? It tries to emit long vectors, relying on backend (for now) on splitting. |
Reported from #153393 (comment) During DAGCombine, an intermediate extract_subvector sequence was generated: ``` t8: v9i16 = extract_subvector t3, Constant:i64<9> t24: v8i16 = extract_subvector t8, Constant:i64<0> ``` And one of the DAGCombine rule which turns `(extract_subvector (extract_subvector X, C), 0)` into `(extract_subvector X, C)` kicked in and turn that into `v8i16 = extract_subvector t3, Constant:i64<9>`. But it forgot to check if the extracted index is a multiple of the minimum vector length of the result type, hence the crash. This patch fixes this by adding an additional check.
|
… (#153709) Reported from llvm/llvm-project#153393 (comment) During DAGCombine, an intermediate extract_subvector sequence was generated: ``` t8: v9i16 = extract_subvector t3, Constant:i64<9> t24: v8i16 = extract_subvector t8, Constant:i64<0> ``` And one of the DAGCombine rule which turns `(extract_subvector (extract_subvector X, C), 0)` into `(extract_subvector X, C)` kicked in and turn that into `v8i16 = extract_subvector t3, Constant:i64<9>`. But it forgot to check if the extracted index is a multiple of the minimum vector length of the result type, hence the crash. This patch fixes this by adding an additional check.
LV is different here, at least now. There is a plan to fix it. |
Added support for LShr instructions as base for copyable elements. Also, added simple analysis for best base instruction selection, if multiple candidates are available. Fixed scheduling after cancellation Reviewers: hiraditya, RKSimon Reviewed By: RKSimon Pull Request: #153393
Added support for LShr instructions as base for copyable elements. Also, added simple analysis for best base instruction selection, if multiple candidates are available. Fixed scheduling after cancellation Reviewers: hiraditya, RKSimon Reviewed By: RKSimon Pull Request: llvm/llvm-project#153393
hi, this introduces crashes, even at head after the fixes:
|
This reverts commit ca4ebf9. Causes compile-time crashes for some inputs with RVV zvl512b/zvl1024b configurations. See here for a minimal reproducer: llvm#153393 (comment)
Reported from llvm#153393 (comment) During DAGCombine, an intermediate extract_subvector sequence was generated: ``` t8: v9i16 = extract_subvector t3, Constant:i64<9> t24: v8i16 = extract_subvector t8, Constant:i64<0> ``` And one of the DAGCombine rule which turns `(extract_subvector (extract_subvector X, C), 0)` into `(extract_subvector X, C)` kicked in and turn that into `v8i16 = extract_subvector t3, Constant:i64<9>`. But it forgot to check if the extracted index is a multiple of the minimum vector length of the result type, hence the crash. This patch fixes this by adding an additional check.
Checked the most recent version, cannot reproduce it |
Another crash here with this patch: Result:
I've tested against latest trunk 673750f and it still crashes there. |
Came here to say the same things as @mikaelholmen. |
this seems to have been fixed with 758c685 |
Ping @alexey-bataev This still happens at latest trunk, c6fbd12. |
Hi, sorry for the delay, don't have the access to the computer this week, will fix next week. |
Added support for LShr instructions as base for copyable elements. Also,
added simple analysis for best base instruction selection, if multiple
candidates are available.