diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a37627e36b9f..f4dabbe6c9c73 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9854,6 +9854,27 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
       SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
       return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
     }
+
+    if (VTList.VTs[0].isVector() &&
+        VTList.VTs[0].getVectorElementType() == MVT::i1 &&
+        VTList.VTs[1].getVectorElementType() == MVT::i1) {
+      SDValue F1 = getFreeze(N1);
+      SDValue F2 = getFreeze(N2);
+      // {vXi1,vXi1} (u/s)addo(vXi1 x, vXi1 y) -> {xor(x,y), and(x,y)}
+      if (Opcode == ISD::UADDO || Opcode == ISD::SADDO)
+        return getNode(ISD::MERGE_VALUES, DL, VTList,
+                       {getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
+                        getNode(ISD::AND, DL, VTList.VTs[1], F1, F2)},
+                       Flags);
+      // {vXi1,vXi1} (u/s)subo(vXi1 x, vXi1 y) -> {xor(x,y), and(~x,y)}
+      if (Opcode == ISD::USUBO || Opcode == ISD::SSUBO) {
+        SDValue NotF1 = getNOT(DL, F1, VTList.VTs[0]);
+        return getNode(ISD::MERGE_VALUES, DL, VTList,
+                       {getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
+                        getNode(ISD::AND, DL, VTList.VTs[1], NotF1, F2)},
+                       Flags);
+      }
+    }
     break;
   }
   case ISD::SMUL_LOHI:
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 6ad880020cc66..00609b0df9b4e 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -245,21 +245,17 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4h, #1
+; CHECK-NEXT:    eor v2.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    shl v2.4h, v2.4h, #15
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    cmlt v1.4h, v2.4h, #0
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    fmov d1, d0
-; CHECK-NEXT:    shl v2.4h, v0.4h, #15
-; CHECK-NEXT:    cmlt v2.4h, v2.4h, #0
-; CHECK-NEXT:    bic v1.4h, #2
-; CHECK-NEXT:    cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
 ; CHECK-NEXT:    addv h1, v1.4h
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/X86/pr69080.ll b/llvm/test/CodeGen/X86/pr69080.ll
new file mode 100644
index 0000000000000..1b27adcb1ae7c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr69080.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
+
+define { <4 x i1>, <4 x i1> } @uaddo(<4 x i1> %a) {
+; SSE-LABEL: uaddo:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: uaddo:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %f = call { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
+  ret { <4 x i1>, <4 x i1> } %f
+}
+declare { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
+
+define { <4 x i1>, <4 x i1> } @saddo(<4 x i1> %a) {
+; SSE-LABEL: saddo:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: saddo:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps %xmm0, %xmm1
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %f = call { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
+  ret { <4 x i1>, <4 x i1> } %f
+}
+declare { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 7631367ba5d66..eae9b969211f6 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -976,34 +976,24 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: saddo_v4i1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm2
+; SSE-NEXT:    pslld $31, %xmm2
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pslld $31, %xmm0
 ; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    movmskps %xmm1, %eax
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    movb %al, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: saddo_v4i1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vmovmskps %xmm2, %eax
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm1
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm1, %eax
 ; AVX-NEXT:    movb %al, (%rdi)
 ; AVX-NEXT:    retq
 ;
@@ -1011,11 +1001,10 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k2
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0 {%k2}
-; AVX512-NEXT:    kxorw %k0, %k1, %k1
+; AVX512-NEXT:    kandw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT:    kshiftlw $12, %k2, %k0
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index d634457069c0d..f8cf543cb9fab 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -985,34 +985,24 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: ssubo_v4i1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm2
+; SSE-NEXT:    pslld $31, %xmm2
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    pslld $31, %xmm0
 ; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    psubd %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    movmskps %xmm1, %eax
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    movb %al, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: ssubo_v4i1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vmovmskps %xmm2, %eax
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm1
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovmskps %xmm1, %eax
 ; AVX-NEXT:    movb %al, (%rdi)
 ; AVX-NEXT:    retq
 ;
@@ -1022,11 +1012,11 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
 ; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT:    kxorw %k1, %k0, %k0
+; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1 {%k1}
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 653c3a9969151..950e943bd9020 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1075,49 +1075,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: uaddo_v4i1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm2
+; SSE-NEXT:    pslld $31, %xmm2
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pslld $31, %xmm0
-; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    psrad $31, %xmm0
 ; SSE-NEXT:    movb %al, (%rdi)
-; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: uaddo_v4i1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    movb %al, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uaddo_v4i1:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    movb %al, (%rdi)
-; AVX2-NEXT:    retq
+; AVX-LABEL: uaddo_v4i1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vmovmskps %xmm2, %eax
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT:    movb %al, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: uaddo_v4i1:
 ; AVX512:       # %bb.0:
@@ -1125,11 +1102,11 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    kandnw %k0, %k1, %k2
+; AVX512-NEXT:    kxorw %k1, %k0, %k2
+; AVX512-NEXT:    kandw %k1, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kshiftlw $12, %k2, %k0
 ; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index a58c3dd0d5307..7de972770d8da 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1122,49 +1122,26 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; SSE-LABEL: usubo_v4i1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    psubd %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm2
+; SSE-NEXT:    pslld $31, %xmm2
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    pandn %xmm1, %xmm0
 ; SSE-NEXT:    pslld $31, %xmm0
-; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    psrad $31, %xmm0
 ; SSE-NEXT:    movb %al, (%rdi)
-; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: usubo_v4i1:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    movb %al, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: usubo_v4i1:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    movb %al, (%rdi)
-; AVX2-NEXT:    retq
+; AVX-LABEL: usubo_v4i1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vmovmskps %xmm2, %eax
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT:    movb %al, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: usubo_v4i1:
 ; AVX512:       # %bb.0:
@@ -1172,11 +1149,11 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
 ; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT:    kxorw %k1, %k0, %k0
+; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1 {%k1}
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
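
Not part of the patch, for review reference only: the new getNode fold rests on four 1-bit identities. Both add and sub reduce to xor(x,y); unsigned carry is and(x,y); unsigned borrow is and(~x,y); and the signed overflow predicates coincide with the unsigned ones, since the only signed i1 values are 0 and -1. The getFreeze calls guard the duplicated operand uses (each input now feeds both the result and the overflow value, and an undef input must not take different values at those uses). A minimal standalone C++ sketch, exhaustive over all four input combinations:

// Standalone sanity check for the i1 identities above (my own exhaustive
// verification over the 1-bit domain; not part of the patch).
#include <cassert>
#include <cstdio>

int main() {
  for (unsigned X = 0; X <= 1; ++X) {
    for (unsigned Y = 0; Y <= 1; ++Y) {
      // Unsigned reference semantics: 1-bit result, carry/borrow out.
      unsigned Sum = (X + Y) & 1;
      unsigned Diff = (X - Y) & 1;
      unsigned UAddOvf = (X + Y) > 1; // carry out of bit 0
      unsigned USubOvf = X < Y;       // borrow into bit 0
      // Signed reference semantics: i1 holds 0 or -1, so any result
      // outside {-1, 0} is a signed overflow.
      int SX = X ? -1 : 0, SY = Y ? -1 : 0;
      unsigned SAddOvf = (SX + SY) < -1 || (SX + SY) > 0;
      unsigned SSubOvf = (SX - SY) < -1 || (SX - SY) > 0;
      // The identities introduced by the fold.
      assert(Sum == (X ^ Y) && Diff == (X ^ Y));
      assert(UAddOvf == (X & Y) && SAddOvf == (X & Y));
      assert(USubOvf == (~X & Y & 1) && SSubOvf == (~X & Y & 1));
    }
  }
  puts("i1 (u/s)addo and (u/s)subo identities verified");
  return 0;
}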