Skip to content

Commit cc21aa1

Browse files
committed
[X86] lower1BitShuffle - fold permute(setcc(x,y)) -> setcc(permute(x),permute(y)) for 32/64-bit element vectors
Noticed in #77459 - for wider element types, it's usually better to pre-shuffle the comparison arguments if we can, like we already do for broadcasts.
1 parent 7775375 commit cc21aa1

File tree

3 files changed

+22
-36
lines changed

3 files changed

+22
-36
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17224,6 +17224,7 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
1722417224
"Cannot lower 512-bit vectors w/o basic ISA!");
1722517225

1722617226
int NumElts = Mask.size();
17227+
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
1722717228

1722817229
// Try to recognize shuffles that are just padding a subvector with zeros.
1722917230
int SubvecElts = 0;
@@ -17289,17 +17290,18 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
1728917290
Offset += NumElts; // Increment for next iteration.
1729017291
}
1729117292

17292-
// If we're broadcasting a SETCC result, try to broadcast the ops instead.
17293+
// If we're performing a unary shuffle on a SETCC result, try to shuffle the
17294+
// ops instead.
1729317295
// TODO: What other unary shuffles would benefit from this?
17294-
if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
17295-
V1->hasOneUse()) {
17296+
if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
1729617297
SDValue Op0 = V1.getOperand(0);
1729717298
SDValue Op1 = V1.getOperand(1);
1729817299
ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
1729917300
EVT OpVT = Op0.getValueType();
17300-
return DAG.getSetCC(
17301-
DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17302-
DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17301+
if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17302+
return DAG.getSetCC(
17303+
DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17304+
DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
1730317305
}
1730417306

1730517307
MVT ExtVT;

llvm/test/CodeGen/X86/pr77459.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,9 @@ define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
4242
;
4343
; AVX512-LABEL: reverse_cmp_v4i1:
4444
; AVX512: # %bb.0:
45-
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
46-
; AVX512-NEXT: vpmovm2d %k0, %xmm0
45+
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
4746
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
48-
; AVX512-NEXT: vpmovd2m %xmm0, %k0
47+
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
4948
; AVX512-NEXT: kmovd %k0, %eax
5049
; AVX512-NEXT: # kill: def $al killed $al killed $eax
5150
; AVX512-NEXT: retq

llvm/test/CodeGen/X86/vector-shuffle-v1.ll

Lines changed: 12 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
99
; AVX512F-LABEL: shuf2i1_1_0:
1010
; AVX512F: # %bb.0:
1111
; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0
12-
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
13-
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1412
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1513
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
1614
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -21,19 +19,15 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
2119
; AVX512VL-LABEL: shuf2i1_1_0:
2220
; AVX512VL: # %bb.0:
2321
; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
22+
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2423
; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
2524
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
26-
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
27-
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
28-
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
2925
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
3026
; AVX512VL-NEXT: retq
3127
;
3228
; VL_BW_DQ-LABEL: shuf2i1_1_0:
3329
; VL_BW_DQ: # %bb.0:
3430
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
35-
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
36-
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
3731
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
3832
; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
3933
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
@@ -86,10 +80,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
8680
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
8781
; AVX512F-LABEL: shuf4i1_3_2_10:
8882
; AVX512F: # %bb.0:
89-
; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
90-
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
91-
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
9283
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
84+
; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
9385
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
9486
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
9587
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -98,21 +90,17 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
9890
;
9991
; AVX512VL-LABEL: shuf4i1_3_2_10:
10092
; AVX512VL: # %bb.0:
93+
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
10194
; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
10295
; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
10396
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
104-
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
105-
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
106-
; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
10797
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
10898
; AVX512VL-NEXT: retq
10999
;
110100
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
111101
; VL_BW_DQ: # %bb.0:
112-
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
113-
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
114-
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
115102
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
103+
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
116104
; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
117105
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
118106
; VL_BW_DQ-NEXT: retq
@@ -123,11 +111,10 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
123111
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
124112
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
125113
; AVX512F: # %bb.0:
126-
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
127-
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
128114
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
115+
; AVX512F-NEXT: vpermq %zmm2, %zmm1, %zmm2
129116
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
130-
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
117+
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
131118
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
132119
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
133120
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -136,24 +123,22 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
136123
;
137124
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
138125
; AVX512VL: # %bb.0:
126+
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
127+
; AVX512VL-NEXT: vpermq %zmm2, %zmm1, %zmm2
128+
; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
139129
; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
140130
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
141-
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
142-
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
143-
; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
144-
; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
145131
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
146132
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
147133
; AVX512VL-NEXT: vzeroupper
148134
; AVX512VL-NEXT: retq
149135
;
150136
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
151137
; VL_BW_DQ: # %bb.0:
138+
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
139+
; VL_BW_DQ-NEXT: vpermq %zmm2, %zmm1, %zmm2
140+
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
152141
; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
153-
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
154-
; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
155-
; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
156-
; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
157142
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
158143
; VL_BW_DQ-NEXT: vzeroupper
159144
; VL_BW_DQ-NEXT: retq

0 commit comments

Comments
 (0)