Skip to content

Commit dcc2525

Browse files
committed
[DAGCombiner][x86] add transform/hook to decompose integer multiply into shift/add
This is an alternative to D37896. I don't see a way to decompose multiplies generically without a target hook to tell us when it's profitable. ARM and AArch64 may be able to remove some duplicate code that overlaps with this transform. As a first step, we're only getting the most clear wins on the vector examples requested in PR34474: https://bugs.llvm.org/show_bug.cgi?id=34474 As noted in the code comment, it's likely that the x86 constraints are tighter than necessary, but it may not always be a win to replace a pmullw/pmulld. Differential Revision: https://reviews.llvm.org/D52195 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@342554 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 68fc66e commit dcc2525

13 files changed

+351
-497
lines changed

include/llvm/CodeGen/TargetLowering.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1721,6 +1721,15 @@ class TargetLoweringBase {
17211721
return false;
17221722
}
17231723

1724+
/// Return true if it is profitable to transform an integer
1725+
/// multiplication-by-constant into simpler operations like shifts and adds.
1726+
/// This may be true if the target does not directly support the
1727+
/// multiplication operation for the specified type or the sequence of simpler
1728+
/// ops is faster than the multiply.
1729+
virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
1730+
return false;
1731+
}
1732+
17241733
//===--------------------------------------------------------------------===//
17251734
// TargetLowering Configuration Methods - These methods should be invoked by
17261735
// the derived class constructor to configure this object for the target.

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2931,6 +2931,32 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
29312931
getShiftAmountTy(N0.getValueType()))));
29322932
}
29332933

2934+
// Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
2935+
// Examples: x * 33 --> (x << 5) + x
2936+
// x * 15 --> (x << 4) - x
2937+
if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
2938+
// TODO: Negative constants can be handled by negating the result.
2939+
// TODO: We could handle more general decomposition of any constant by
2940+
// having the target set a limit on number of ops and making a
2941+
// callback to determine that sequence (similar to sqrt expansion).
2942+
unsigned MathOp = ISD::DELETED_NODE;
2943+
if ((ConstValue1 - 1).isPowerOf2())
2944+
MathOp = ISD::ADD;
2945+
else if ((ConstValue1 + 1).isPowerOf2())
2946+
MathOp = ISD::SUB;
2947+
2948+
if (MathOp != ISD::DELETED_NODE) {
2949+
unsigned ShAmt = MathOp == ISD::ADD ? (ConstValue1 - 1).logBase2()
2950+
: (ConstValue1 + 1).logBase2();
2951+
assert(ShAmt > 0 && ShAmt < VT.getScalarSizeInBits() &&
2952+
"Not expecting multiply-by-constant that could have simplified");
2953+
SDLoc DL(N);
2954+
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0,
2955+
DAG.getConstant(ShAmt, DL, VT));
2956+
return DAG.getNode(MathOp, DL, VT, Shl, N0);
2957+
}
2958+
}
2959+
29342960
// (mul (shl X, c1), c2) -> (mul X, c2 << c1)
29352961
if (N0.getOpcode() == ISD::SHL &&
29362962
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4722,6 +4722,23 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
47224722
return true;
47234723
}
47244724

4725+
bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
4726+
// TODO: We handle scalars using custom code, but generic combining could make
4727+
// that unnecessary.
4728+
APInt MulC;
4729+
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
4730+
return false;
4731+
4732+
// If vector multiply is legal, assume that's faster than shl + add/sub.
4733+
// TODO: Multiply is a complex op with higher latency and lower throughput in
4734+
// most implementations, so this check could be loosened based on type
4735+
// and/or a CPU attribute.
4736+
if (isOperationLegal(ISD::MUL, VT))
4737+
return false;
4738+
4739+
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2();
4740+
}
4741+
47254742
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
47264743
unsigned Index) const {
47274744
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))

lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,8 @@ namespace llvm {
10341034

10351035
bool convertSelectOfConstantsToMath(EVT VT) const override;
10361036

1037+
bool decomposeMulByConstant(EVT VT, SDValue C) const override;
1038+
10371039
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
10381040
/// with this index.
10391041
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,

test/CodeGen/X86/urem-seteq-vec-nonsplat.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -524,14 +524,10 @@ define <4 x i32> @test_urem_comp_nonsplat(<4 x i32> %X) nounwind readnone {
524524
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
525525
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
526526
; CHECK-SSE2-NEXT: psrld $2, %xmm2
527-
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
528-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
529-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
530-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
531-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
532-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
533-
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
534-
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
527+
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
528+
; CHECK-SSE2-NEXT: pslld $2, %xmm1
529+
; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
530+
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
535531
; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
536532
; CHECK-SSE2-NEXT: psrld $31, %xmm0
537533
; CHECK-SSE2-NEXT: retq
@@ -728,14 +724,10 @@ define <4 x i32> @test_urem_comp_undef(<4 x i32> %X) nounwind readnone {
728724
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
729725
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
730726
; CHECK-SSE2-NEXT: psrld $2, %xmm2
731-
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
732-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
733-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
734-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
735-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
736-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
737-
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
738-
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
727+
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
728+
; CHECK-SSE2-NEXT: pslld $2, %xmm1
729+
; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
730+
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
739731
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
740732
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
741733
; CHECK-SSE2-NEXT: psrld $31, %xmm0

test/CodeGen/X86/urem-seteq-vec-splat.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,10 @@ define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone {
1919
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
2020
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2121
; CHECK-SSE2-NEXT: psrld $2, %xmm2
22-
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
23-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
24-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
25-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
26-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
27-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
28-
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
29-
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
22+
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
23+
; CHECK-SSE2-NEXT: pslld $2, %xmm1
24+
; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
25+
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
3026
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
3127
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
3228
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -116,14 +112,10 @@ define <4 x i16> @test_urem_odd_vec_i16(<4 x i16> %X) nounwind readnone {
116112
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
117113
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
118114
; CHECK-SSE2-NEXT: psrld $2, %xmm2
119-
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,5,5,5]
120-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
121-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
122-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
123-
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
124-
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
125-
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
126-
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
115+
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
116+
; CHECK-SSE2-NEXT: pslld $2, %xmm1
117+
; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
118+
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
127119
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
128120
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
129121
; CHECK-SSE2-NEXT: psrld $31, %xmm0

test/CodeGen/X86/vector-idiv-sdiv-128.ll

Lines changed: 26 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -396,14 +396,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
396396
; SSE2-NEXT: psrld $31, %xmm1
397397
; SSE2-NEXT: psrad $2, %xmm2
398398
; SSE2-NEXT: paddd %xmm1, %xmm2
399-
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7]
400-
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
401-
; SSE2-NEXT: pmuludq %xmm1, %xmm2
402-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
403-
; SSE2-NEXT: pmuludq %xmm1, %xmm3
404-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
405-
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
406-
; SSE2-NEXT: psubd %xmm2, %xmm0
399+
; SSE2-NEXT: movdqa %xmm2, %xmm1
400+
; SSE2-NEXT: pslld $3, %xmm1
401+
; SSE2-NEXT: psubd %xmm1, %xmm2
402+
; SSE2-NEXT: paddd %xmm2, %xmm0
407403
; SSE2-NEXT: retq
408404
;
409405
; SSE41-LABEL: test_rem7_4i32:
@@ -511,16 +507,11 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
511507
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
512508
; SSE2-NEXT: paddb %xmm2, %xmm1
513509
; SSE2-NEXT: movdqa %xmm1, %xmm2
514-
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
515-
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
516-
; SSE2-NEXT: pmullw %xmm3, %xmm2
517-
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
518-
; SSE2-NEXT: pand %xmm4, %xmm2
519-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
520-
; SSE2-NEXT: pmullw %xmm3, %xmm1
521-
; SSE2-NEXT: pand %xmm4, %xmm1
522-
; SSE2-NEXT: packuswb %xmm2, %xmm1
523-
; SSE2-NEXT: psubb %xmm1, %xmm0
510+
; SSE2-NEXT: psllw $3, %xmm2
511+
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
512+
; SSE2-NEXT: psubb %xmm2, %xmm1
513+
; SSE2-NEXT: paddb %xmm0, %xmm1
514+
; SSE2-NEXT: movdqa %xmm1, %xmm0
524515
; SSE2-NEXT: retq
525516
;
526517
; SSE41-LABEL: test_rem7_16i8:
@@ -544,16 +535,11 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
544535
; SSE41-NEXT: psrlw $7, %xmm1
545536
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
546537
; SSE41-NEXT: paddb %xmm2, %xmm1
547-
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
548-
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
549-
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
550-
; SSE41-NEXT: pmullw %xmm3, %xmm1
551-
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
552-
; SSE41-NEXT: pand %xmm4, %xmm1
553-
; SSE41-NEXT: pmullw %xmm3, %xmm2
554-
; SSE41-NEXT: pand %xmm4, %xmm2
555-
; SSE41-NEXT: packuswb %xmm1, %xmm2
556-
; SSE41-NEXT: psubb %xmm2, %xmm0
538+
; SSE41-NEXT: movdqa %xmm1, %xmm2
539+
; SSE41-NEXT: psllw $3, %xmm2
540+
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
541+
; SSE41-NEXT: psubb %xmm2, %xmm1
542+
; SSE41-NEXT: paddb %xmm1, %xmm0
557543
; SSE41-NEXT: retq
558544
;
559545
; AVX1-LABEL: test_rem7_16i8:
@@ -576,16 +562,10 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
576562
; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
577563
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
578564
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
579-
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
580-
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
581-
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
582-
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
583-
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
584-
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
585-
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
586-
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
587-
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
588-
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
565+
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
566+
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
567+
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
568+
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
589569
; AVX1-NEXT: retq
590570
;
591571
; AVX2NOBW-LABEL: test_rem7_16i8:
@@ -604,14 +584,10 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
604584
; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
605585
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
606586
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
607-
; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
608-
; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
609-
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
610-
; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
611-
; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
612-
; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
613-
; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
614-
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
587+
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
588+
; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
589+
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
590+
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
615591
; AVX2NOBW-NEXT: vzeroupper
616592
; AVX2NOBW-NEXT: retq
617593
;
@@ -630,10 +606,10 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
630606
; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
631607
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
632608
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
633-
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
634-
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
635-
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
636-
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
609+
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
610+
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
611+
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
612+
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
637613
; AVX512BW-NEXT: vzeroupper
638614
; AVX512BW-NEXT: retq
639615
%res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>

0 commit comments

Comments
 (0)