Skip to content
This repository was archived by the owner on Sep 2, 2018. It is now read-only.

Commit 78c67eb

Browse files
Jun Bum LimDylan McKay
Jun Bum Lim
authored and
Dylan McKay
committed
Improve ISel across lane float min/max reduction
In vectorized float min/max reduction code, the final "reduce" step is sub-optimal. In AArch64, this change will combine:

  svn0 = vector_shuffle t0, undef<2,3,u,u>
  fmin = fminnum t0, svn0
  svn1 = vector_shuffle fmin, undef<1,u,u,u>
  cc = setcc fmin, svn1, ole
  n0 = extract_vector_elt cc, #0
  n1 = extract_vector_elt fmin, #0
  n2 = extract_vector_elt fmin, #1
  result = select n0, n1, n2

into:

  result = llvm.aarch64.neon.fminnmv t0

This change extends r247575.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@249834 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 4fb3e55 commit 78c67eb

File tree

2 files changed

+79
-12
lines changed

2 files changed

+79
-12
lines changed

lib/Target/AArch64/AArch64ISelLowering.cpp

+47-12
Original file line numberDiff line numberDiff line change
@@ -8750,8 +8750,13 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
87508750
return SDValue();
87518751

87528752
int NumVecElts = VTy.getVectorNumElements();
8753-
if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
8754-
return SDValue();
8753+
if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
8754+
if (NumVecElts != 4)
8755+
return SDValue();
8756+
} else {
8757+
if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
8758+
return SDValue();
8759+
}
87558760

87568761
int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
87578762
SDValue PreOp = OpV;
@@ -8802,6 +8807,8 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
88028807
PreOp = CurOp;
88038808
}
88048809
unsigned Opcode;
8810+
bool IsIntrinsic = false;
8811+
88058812
switch (Op) {
88068813
default:
88078814
llvm_unreachable("Unexpected operator for across vector reduction");
@@ -8820,11 +8827,24 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
88208827
case ISD::UMIN:
88218828
Opcode = AArch64ISD::UMINV;
88228829
break;
8830+
case ISD::FMAXNUM:
8831+
Opcode = Intrinsic::aarch64_neon_fmaxnmv;
8832+
IsIntrinsic = true;
8833+
break;
8834+
case ISD::FMINNUM:
8835+
Opcode = Intrinsic::aarch64_neon_fminnmv;
8836+
IsIntrinsic = true;
8837+
break;
88238838
}
88248839
SDLoc DL(N);
8825-
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
8826-
DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
8827-
DAG.getConstant(0, DL, MVT::i64));
8840+
8841+
return IsIntrinsic
8842+
? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
8843+
DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
8844+
: DAG.getNode(
8845+
ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
8846+
DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
8847+
DAG.getConstant(0, DL, MVT::i64));
88288848
}
88298849

88308850
/// Target-specific DAG combine for the across vector min/max reductions.
@@ -8848,9 +8868,6 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
88488868
/// becomes :
88498869
/// %1 = smaxv %0
88508870
/// %result = extract_vector_elt %1, 0
8851-
/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and UMINV.
8852-
/// We could also support other types of across lane reduction available
8853-
/// in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV.
88548871
static SDValue
88558872
performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
88568873
const AArch64Subtarget *Subtarget) {
@@ -8878,17 +8895,26 @@ performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
88788895
SDValue VectorOp = SetCC.getOperand(0);
88798896
unsigned Op = VectorOp->getOpcode();
88808897
// Check if the input vector is fed by the operator we want to handle.
8881-
if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN)
8898+
if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
8899+
Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
88828900
return SDValue();
88838901

88848902
EVT VTy = VectorOp.getValueType();
88858903
if (!VTy.isVector())
88868904
return SDValue();
88878905

8888-
EVT EltTy = VTy.getVectorElementType();
8889-
if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
8906+
if (VTy.getSizeInBits() < 64)
88908907
return SDValue();
88918908

8909+
EVT EltTy = VTy.getVectorElementType();
8910+
if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
8911+
if (EltTy != MVT::f32)
8912+
return SDValue();
8913+
} else {
8914+
if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
8915+
return SDValue();
8916+
}
8917+
88928918
// Check if extracting from the same vector.
88938919
// For example,
88948920
// %sc = setcc %vector, %svn1, gt
@@ -8904,7 +8930,13 @@ performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
89048930
if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
89058931
(Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
89068932
(Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
8907-
(Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE))
8933+
(Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
8934+
(Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
8935+
CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
8936+
CC != ISD::SETGE) ||
8937+
(Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
8938+
CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
8939+
CC != ISD::SETLE))
89088940
return SDValue();
89098941

89108942
// Expect to check only lane 0 from the vector SETCC.
@@ -8963,6 +8995,9 @@ performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
89638995
if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
89648996
return SDValue();
89658997

8998+
if (VTy.getSizeInBits() < 64)
8999+
return SDValue();
9000+
89669001
return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
89679002
}
89689003

test/CodeGen/AArch64/aarch64-minmaxv.ll

+32
Original file line numberDiff line numberDiff line change
@@ -285,3 +285,35 @@ define i64 @umin_D(<2 x i64>* nocapture readonly %arr) {
285285
%r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
286286
ret i64 %r
287287
}
288+
289+
; CHECK-LABEL: f_fmaxnmv
290+
; CHECK: fmaxnmv
291+
define float @f_fmaxnmv(<4 x float>* nocapture readonly %arr) {
292+
%rdx.minmax.select = load <4 x float>, <4 x float>* %arr
293+
%rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
294+
%rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf
295+
%rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
296+
%rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
297+
%rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1
298+
%rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
299+
%rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
300+
%rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
301+
%r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
302+
ret float %r
303+
}
304+
305+
; CHECK-LABEL: f_fminnmv
306+
; CHECK: fminnmv
307+
define float @f_fminnmv(<4 x float>* nocapture readonly %arr) {
308+
%rdx.minmax.select = load <4 x float>, <4 x float>* %arr
309+
%rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
310+
%rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf
311+
%rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
312+
%rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
313+
%rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1
314+
%rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
315+
%rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
316+
%rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
317+
%r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
318+
ret float %r
319+
}

0 commit comments

Comments
 (0)