Skip to content

Commit 6e3827a

Browse files
committed
[AMDGPU] Create matchPERM helper from performOrCombine PERM matching code.
Pulled out as NFC(ish) pre-commit from D159533
1 parent bd02816 commit 6e3827a

File tree

1 file changed

+85
-74
lines changed

1 file changed

+85
-74
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+85-74
Original file line number | Diff line number | Diff line change
@@ -11003,6 +11003,89 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
1100311003
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
1100411004
}
1100511005

11006+
/// Try to rewrite the i32 value produced by \p N as a single AMDGPUISD::PERM
/// (or as a plain bitcast of one source when the bytes are taken in order).
///
/// Every one of the four result bytes is traced back to its origin with
/// calculateByteProvider. The match succeeds only if all four bytes come from
/// at most two distinct source values and none of them is a constant zero.
///
/// \param N   Node whose result value 0 is being matched.
/// \param DCI Combiner info; only its SelectionDAG is used here.
/// \returns The replacement value, or an empty SDValue when \p N is not i32,
///          a byte provider cannot be found, more than two sources are
///          involved, or hasNon16BitAccesses reports false for the mask.
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of N.
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(*P);
  }
  // The loop above either returned early or pushed exactly one provider per
  // byte, so a size mismatch here would be a logic error, not a failed match.
  assert(PermNodes.size() == 4 && "expected one byte provider per result byte");

  // The mask layout computed below is only valid for little-endian targets.
  // This does not depend on the byte being processed, so check it once.
  assert(!DAG.getDataLayout().isBigEndian());

  // Index (into PermNodes) of a byte that comes from the first source, and of
  // a byte that comes from the second distinct source, if there is one.
  const size_t FirstSrc = 0;
  std::optional<size_t> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    const auto &PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
      // A third distinct source cannot be expressed with a two-operand PERM.
      if (SecondSrc.has_value())
        if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
          return SDValue();

      // Set the index of the second distinct Src node
      SecondSrc = i;
      assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
      // Bytes of the second source are selected without the offset.
      SrcByteAdjust = 0;
    }
    assert(PermOp.SrcOffset + SrcByteAdjust < 8);
    // Selector for result byte i occupies bits [8*i, 8*i + 7] of the mask.
    PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
  }

  SDValue Op = *PermNodes[FirstSrc].Src;
  // With a single source, that source is used for both PERM operands.
  SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
                                          : *PermNodes[FirstSrc].Src;

  // Check that we are not just extracting the bytes in order from an op
  if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
  }

  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
    SDLoc DL(N);
    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // CalculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are don't-cares.
    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
    OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);

    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                       DAG.getConstant(PermMask, DL, MVT::i32));
  }

  return SDValue();
}
11088+
1100611089
SDValue SITargetLowering::performOrCombine(SDNode *N,
1100711090
DAGCombinerInfo &DCI) const {
1100811091
SelectionDAG &DAG = DCI.DAG;
@@ -11116,80 +11199,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
1111611199
}
1111711200
}
1111811201
if (LHSMask == ~0u || RHSMask == ~0u) {
11119-
SmallVector<ByteProvider<SDValue>, 8> PermNodes;
11120-
11121-
// VT is known to be MVT::i32, so we need to provide 4 bytes.
11122-
assert(VT == MVT::i32);
11123-
for (int i = 0; i < 4; i++) {
11124-
// Find the ByteProvider that provides the ith byte of the result of OR
11125-
std::optional<ByteProvider<SDValue>> P =
11126-
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
11127-
// TODO support constantZero
11128-
if (!P || P->isConstantZero())
11129-
return SDValue();
11130-
11131-
PermNodes.push_back(*P);
11132-
}
11133-
if (PermNodes.size() != 4)
11134-
return SDValue();
11135-
11136-
int FirstSrc = 0;
11137-
std::optional<int> SecondSrc;
11138-
uint64_t PermMask = 0x00000000;
11139-
for (size_t i = 0; i < PermNodes.size(); i++) {
11140-
auto PermOp = PermNodes[i];
11141-
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
11142-
// by sizeof(Src2) = 4
11143-
int SrcByteAdjust = 4;
11144-
11145-
if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
11146-
if (SecondSrc.has_value())
11147-
if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
11148-
return SDValue();
11149-
11150-
// Set the index of the second distinct Src node
11151-
SecondSrc = i;
11152-
assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
11153-
SrcByteAdjust = 0;
11154-
}
11155-
assert(PermOp.SrcOffset + SrcByteAdjust < 8);
11156-
assert(!DAG.getDataLayout().isBigEndian());
11157-
PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
11158-
}
11159-
11160-
SDValue Op = *PermNodes[FirstSrc].Src;
11161-
SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
11162-
: *PermNodes[FirstSrc].Src;
11163-
11164-
// Check that we are not just extracting the bytes in order from an op
11165-
if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
11166-
int Low16 = PermMask & 0xffff;
11167-
int Hi16 = (PermMask & 0xffff0000) >> 16;
11168-
11169-
bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
11170-
bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
11171-
11172-
// The perm op would really just produce Op. So combine into Op
11173-
if (WellFormedLow && WellFormedHi)
11174-
return DAG.getBitcast(MVT::getIntegerVT(32), Op);
11175-
}
11176-
11177-
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
11178-
SDLoc DL(N);
11179-
assert(Op.getValueType().isByteSized() &&
11180-
OtherOp.getValueType().isByteSized());
11181-
11182-
// If the ultimate src is less than 32 bits, then we will only be
11183-
// using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
11184-
// CalculateByteProvider would not have returned Op as source if we
11185-
// used a byte that is outside its ValueType. Thus, we are free to
11186-
// ANY_EXTEND as the extended bits are dont-cares.
11187-
Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
11188-
OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
11189-
11190-
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
11191-
DAG.getConstant(PermMask, DL, MVT::i32));
11192-
}
11202+
if (SDValue Perm = matchPERM(N, DCI))
11203+
return Perm;
1119311204
}
1119411205
}
1119511206

0 commit comments

Comments
 (0)