@@ -11003,6 +11003,89 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
11003
11003
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
11004
11004
}
11005
11005
11006
+ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
11007
+ SelectionDAG &DAG = DCI.DAG;
11008
+ EVT VT = N->getValueType(0);
11009
+
11010
+ if (VT != MVT::i32)
11011
+ return SDValue();
11012
+
11013
+ // VT is known to be MVT::i32, so we need to provide 4 bytes.
11014
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes;
11015
+ for (int i = 0; i < 4; i++) {
11016
+ // Find the ByteProvider that provides the ith byte of the result of OR
11017
+ std::optional<ByteProvider<SDValue>> P =
11018
+ calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
11019
+ // TODO support constantZero
11020
+ if (!P || P->isConstantZero())
11021
+ return SDValue();
11022
+
11023
+ PermNodes.push_back(*P);
11024
+ }
11025
+ if (PermNodes.size() != 4)
11026
+ return SDValue();
11027
+
11028
+ int FirstSrc = 0;
11029
+ std::optional<int> SecondSrc;
11030
+ uint64_t PermMask = 0x00000000;
11031
+ for (size_t i = 0; i < PermNodes.size(); i++) {
11032
+ auto PermOp = PermNodes[i];
11033
+ // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
11034
+ // by sizeof(Src2) = 4
11035
+ int SrcByteAdjust = 4;
11036
+
11037
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
11038
+ if (SecondSrc.has_value())
11039
+ if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
11040
+ return SDValue();
11041
+
11042
+ // Set the index of the second distinct Src node
11043
+ SecondSrc = i;
11044
+ assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
11045
+ SrcByteAdjust = 0;
11046
+ }
11047
+ assert(PermOp.SrcOffset + SrcByteAdjust < 8);
11048
+ assert(!DAG.getDataLayout().isBigEndian());
11049
+ PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
11050
+ }
11051
+
11052
+ SDValue Op = *PermNodes[FirstSrc].Src;
11053
+ SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
11054
+ : *PermNodes[FirstSrc].Src;
11055
+
11056
+ // Check that we are not just extracting the bytes in order from an op
11057
+ if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
11058
+ int Low16 = PermMask & 0xffff;
11059
+ int Hi16 = (PermMask & 0xffff0000) >> 16;
11060
+
11061
+ bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
11062
+ bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
11063
+
11064
+ // The perm op would really just produce Op. So combine into Op
11065
+ if (WellFormedLow && WellFormedHi)
11066
+ return DAG.getBitcast(MVT::getIntegerVT(32), Op);
11067
+ }
11068
+
11069
+ if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
11070
+ SDLoc DL(N);
11071
+ assert(Op.getValueType().isByteSized() &&
11072
+ OtherOp.getValueType().isByteSized());
11073
+
11074
+ // If the ultimate src is less than 32 bits, then we will only be
11075
+ // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
11076
+ // CalculateByteProvider would not have returned Op as source if we
11077
+ // used a byte that is outside its ValueType. Thus, we are free to
11078
+ // ANY_EXTEND as the extended bits are dont-cares.
11079
+ Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
11080
+ OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
11081
+
11082
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
11083
+ DAG.getConstant(PermMask, DL, MVT::i32));
11084
+ }
11085
+
11086
+ return SDValue();
11087
+ }
11088
+
11006
11089
SDValue SITargetLowering::performOrCombine(SDNode *N,
11007
11090
DAGCombinerInfo &DCI) const {
11008
11091
SelectionDAG &DAG = DCI.DAG;
@@ -11116,80 +11199,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
11116
11199
}
11117
11200
}
11118
11201
if (LHSMask == ~0u || RHSMask == ~0u) {
11119
- SmallVector<ByteProvider<SDValue>, 8> PermNodes;
11120
-
11121
- // VT is known to be MVT::i32, so we need to provide 4 bytes.
11122
- assert(VT == MVT::i32);
11123
- for (int i = 0; i < 4; i++) {
11124
- // Find the ByteProvider that provides the ith byte of the result of OR
11125
- std::optional<ByteProvider<SDValue>> P =
11126
- calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
11127
- // TODO support constantZero
11128
- if (!P || P->isConstantZero())
11129
- return SDValue();
11130
-
11131
- PermNodes.push_back(*P);
11132
- }
11133
- if (PermNodes.size() != 4)
11134
- return SDValue();
11135
-
11136
- int FirstSrc = 0;
11137
- std::optional<int> SecondSrc;
11138
- uint64_t PermMask = 0x00000000;
11139
- for (size_t i = 0; i < PermNodes.size(); i++) {
11140
- auto PermOp = PermNodes[i];
11141
- // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
11142
- // by sizeof(Src2) = 4
11143
- int SrcByteAdjust = 4;
11144
-
11145
- if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
11146
- if (SecondSrc.has_value())
11147
- if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
11148
- return SDValue();
11149
-
11150
- // Set the index of the second distinct Src node
11151
- SecondSrc = i;
11152
- assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
11153
- SrcByteAdjust = 0;
11154
- }
11155
- assert(PermOp.SrcOffset + SrcByteAdjust < 8);
11156
- assert(!DAG.getDataLayout().isBigEndian());
11157
- PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
11158
- }
11159
-
11160
- SDValue Op = *PermNodes[FirstSrc].Src;
11161
- SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
11162
- : *PermNodes[FirstSrc].Src;
11163
-
11164
- // Check that we are not just extracting the bytes in order from an op
11165
- if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
11166
- int Low16 = PermMask & 0xffff;
11167
- int Hi16 = (PermMask & 0xffff0000) >> 16;
11168
-
11169
- bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
11170
- bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
11171
-
11172
- // The perm op would really just produce Op. So combine into Op
11173
- if (WellFormedLow && WellFormedHi)
11174
- return DAG.getBitcast(MVT::getIntegerVT(32), Op);
11175
- }
11176
-
11177
- if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
11178
- SDLoc DL(N);
11179
- assert(Op.getValueType().isByteSized() &&
11180
- OtherOp.getValueType().isByteSized());
11181
-
11182
- // If the ultimate src is less than 32 bits, then we will only be
11183
- // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
11184
- // CalculateByteProvider would not have returned Op as source if we
11185
- // used a byte that is outside its ValueType. Thus, we are free to
11186
- // ANY_EXTEND as the extended bits are dont-cares.
11187
- Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
11188
- OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
11189
-
11190
- return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
11191
- DAG.getConstant(PermMask, DL, MVT::i32));
11192
- }
11202
+ if (SDValue Perm = matchPERM(N, DCI))
11203
+ return Perm;
11193
11204
}
11194
11205
}
11195
11206
0 commit comments