diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index dff9049dacf3c..aa9032ea2574c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11501,28 +11501,28 @@ static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL, if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1) return SDValue(); - // select Cond, Cond, F --> or Cond, F - // select Cond, 1, F --> or Cond, F + // select Cond, Cond, F --> or Cond, freeze(F) + // select Cond, 1, F --> or Cond, freeze(F) if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true)) - return matcher.getNode(ISD::OR, DL, VT, Cond, F); + return matcher.getNode(ISD::OR, DL, VT, Cond, DAG.getFreeze(F)); - // select Cond, T, Cond --> and Cond, T - // select Cond, T, 0 --> and Cond, T + // select Cond, T, Cond --> and Cond, freeze(T) + // select Cond, T, 0 --> and Cond, freeze(T) if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true)) - return matcher.getNode(ISD::AND, DL, VT, Cond, T); + return matcher.getNode(ISD::AND, DL, VT, Cond, DAG.getFreeze(T)); - // select Cond, T, 1 --> or (not Cond), T + // select Cond, T, 1 --> or (not Cond), freeze(T) if (isOneOrOneSplat(F, /* AllowUndefs */ true)) { SDValue NotCond = matcher.getNode(ISD::XOR, DL, VT, Cond, DAG.getAllOnesConstant(DL, VT)); - return matcher.getNode(ISD::OR, DL, VT, NotCond, T); + return matcher.getNode(ISD::OR, DL, VT, NotCond, DAG.getFreeze(T)); } - // select Cond, 0, F --> and (not Cond), F + // select Cond, 0, F --> and (not Cond), freeze(F) if (isNullOrNullSplat(T, /* AllowUndefs */ true)) { SDValue NotCond = matcher.getNode(ISD::XOR, DL, VT, Cond, DAG.getAllOnesConstant(DL, VT)); - return matcher.getNode(ISD::AND, DL, VT, NotCond, F); + return matcher.getNode(ISD::AND, DL, VT, NotCond, DAG.getFreeze(F)); } return SDValue(); @@ -11550,37 +11550,37 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) { else return SDValue(); - // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1 + // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & freeze(N1) if (isNullOrNullSplat(N2)) { SDLoc DL(N); SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt); - return DAG.getNode(ISD::AND, DL, VT, Sra, N1); + return DAG.getNode(ISD::AND, DL, VT, Sra, DAG.getFreeze(N1)); } - // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2 + // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | freeze(N2) if (isAllOnesOrAllOnesSplat(N1)) { SDLoc DL(N); SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt); - return DAG.getNode(ISD::OR, DL, VT, Sra, N2); + return DAG.getNode(ISD::OR, DL, VT, Sra, DAG.getFreeze(N2)); } // If we have to invert the sign bit mask, only do that transform if the // target has a bitwise 'and not' instruction (the invert is free). - // (Cond0 s< -0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2 + // (Cond0 s< -0) ? 
0 : N2 --> ~(Cond0 s>> BW-1) & freeze(N2) const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) { SDLoc DL(N); SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt); SDValue Not = DAG.getNOT(DL, Sra, VT); - return DAG.getNode(ISD::AND, DL, VT, Not, N2); + return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2)); } // TODO: There's another pattern in this family, but it may require // implementing hasOrNot() to check for profitability: - // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2 + // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2) return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index 2ef35283568c3..1c8a8d635274e 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -229,7 +229,8 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x2, x2, x11 -; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d ; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl] diff --git a/llvm/test/CodeGen/AArch64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/fast-isel-select.ll index 315ca4f285fd7..6e55bf4968e78 100644 --- a/llvm/test/CodeGen/AArch64/fast-isel-select.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-select.ll @@ -500,22 +500,11 @@ define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) { ; Test peephole optimizations for select. 
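; A minimal IR sketch of the fold these peephole tests exercise (the value
; names are illustrative, not from this patch): "select i1 %c, i1 true, i1 %a"
; is now combined to the equivalent of
;   %fa = freeze i1 %a
;   %r  = or i1 %c, %fa
; rather than a bare "or i1 %c, %a". The freeze is required for poison
; safety: when %c is true the select never observes %a, so a poison %a must
; not poison the result, yet a plain "or" would propagate it. The extra
; "and w0, w8, #0x1" now emitted on the SDAG path below appears to be a side
; effect of the freeze hiding %a's zero-extension from the known-bits
; peephole.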
define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) { -; CHECK-SDAGISEL-LABEL: select_opt1: -; CHECK-SDAGISEL: ; %bb.0: -; CHECK-SDAGISEL-NEXT: orr w0, w0, w1 -; CHECK-SDAGISEL-NEXT: ret -; -; CHECK-FASTISEL-LABEL: select_opt1: -; CHECK-FASTISEL: ; %bb.0: -; CHECK-FASTISEL-NEXT: orr w8, w0, w1 -; CHECK-FASTISEL-NEXT: and w0, w8, #0x1 -; CHECK-FASTISEL-NEXT: ret -; -; CHECK-GISEL-LABEL: select_opt1: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: orr w8, w0, w1 -; CHECK-GISEL-NEXT: and w0, w8, #0x1 -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: select_opt1: +; CHECK: ; %bb.0: +; CHECK-NEXT: orr w8, w0, w1 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret %1 = select i1 %c, i1 true, i1 %a ret i1 %1 } @@ -523,8 +512,8 @@ define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) { define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) { ; CHECK-SDAGISEL-LABEL: select_opt2: ; CHECK-SDAGISEL: ; %bb.0: -; CHECK-SDAGISEL-NEXT: eor w8, w0, #0x1 -; CHECK-SDAGISEL-NEXT: orr w0, w8, w1 +; CHECK-SDAGISEL-NEXT: orn w8, w1, w0 +; CHECK-SDAGISEL-NEXT: and w0, w8, #0x1 ; CHECK-SDAGISEL-NEXT: ret ; ; CHECK-FASTISEL-LABEL: select_opt2: @@ -547,7 +536,8 @@ define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) { define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) { ; CHECK-SDAGISEL-LABEL: select_opt3: ; CHECK-SDAGISEL: ; %bb.0: -; CHECK-SDAGISEL-NEXT: bic w0, w1, w0 +; CHECK-SDAGISEL-NEXT: eor w8, w0, #0x1 +; CHECK-SDAGISEL-NEXT: and w0, w8, w1 ; CHECK-SDAGISEL-NEXT: ret ; ; CHECK-FASTISEL-LABEL: select_opt3: diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index cdf2a962f9322..66d670d0b796b 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -319,8 +319,9 @@ define i32 @ctz_nxv16i1_poison( %a) { define i32 @ctz_and_nxv16i1( %pg, %a, %b) { ; CHECK-LABEL: ctz_and_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b ; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmpne p2.b, p1/z, z0.b, z1.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p2.b ; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll index afe13851f0b95..0d7f230062650 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll @@ -24,7 +24,8 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, %i37, < ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z4.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z3.s ; CHECK-NEXT: add z0.d, z2.d, z1.d -; CHECK-NEXT: bic p2.b, p1/z, p1.b, p2.b +; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: and p2.b, p1/z, p1.b, p2.b ; CHECK-NEXT: mov z0.d, p2/m, z2.d ; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d ; CHECK-NEXT: uaddv d0, p0, z0.d diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index f0ab3a5342e01..fea1303d0a2b7 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -482,28 +482,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 @@ -514,7 +507,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec @@ -1048,10 +1040,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -2695,28 +2687,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; 
GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 @@ -2727,7 +2712,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec @@ -3261,10 +3245,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt 
vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index ae4d302e04a7c..5b39cc2e185b7 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -15,13 +15,16 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 ; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]] - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GCN-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]] + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc + ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY4]], killed [[COPY3]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -65,15 +68,16 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY4]], implicit-def dead $scc + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY killed [[S_LOAD_DWORDX2_IMM1]] + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY5]], implicit-def dead $scc ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN-NEXT: 
S_CMP_LT_I32 killed [[COPY3]], killed [[S_MOV_B32_2]], implicit-def $scc ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY6]], killed [[COPY5]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_CMP_LT_I32 killed [[COPY4]], killed [[S_MOV_B32_2]], implicit-def $scc + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -122,13 +126,14 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2 ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[S_LOAD_DWORD_IMM]], implicit-def dead $scc + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY7]], implicit-def dead $scc ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]] - ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]] + ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY8]], implicit-def dead $scc ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 99818df6175bd..667a3f398c08a 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -22,13 +22,13 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc -; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc -; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: s_mov_b32 s7, -1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], 
v[0:1] ; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -394,13 +394,13 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc -; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc -; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: s_mov_b32 s7, -1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -765,13 +765,13 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc -; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: s_mov_b32 s7, -1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1123,13 +1123,13 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc -; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: s_mov_b32 s7, -1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1509,13 +1509,13 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc -; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: s_mov_b32 s7, -1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], 
v[0:1] ; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] @@ -1860,13 +1860,13 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc -; SDAG-NEXT: s_mov_b32 s5, -1 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: s_mov_b32 s7, -1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] ; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 6b036f675929e..fe093d4ac8515 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -520,28 +520,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 @@ -552,7 +545,6 @@ define i128 
@v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec @@ -1086,10 +1078,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -1918,28 +1910,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[5:6], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[8:9], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 
s[14:15], v[8:9], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 @@ -1950,7 +1935,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec @@ -2484,10 +2468,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 4449e4f2ea4ed..396c29512933c 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -7,20 +7,20 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; ; CHECK-NEXT: .reg .b32 %r<20>; -; CHECK-NEXT: .reg .b64 %rd<127>; +; CHECK-NEXT: .reg .b64 %rd<129>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: mov.u64 %rd117, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46; +; CHECK-NEXT: mov.u64 %rd119, 0; +; CHECK-NEXT: sub.cc.s64 %rd52, %rd119, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd53, %rd119, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; ; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1; ; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd119, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd119, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; ; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2; ; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2; @@ -43,109 +43,109 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; ; CHECK-NEXT: add.s64 %rd64, %rd63, 64; ; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: sub.cc.s64 %rd7, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd8, %rd117, 0; -; CHECK-NEXT: setp.eq.s64 %p8, %rd8, 0; -; CHECK-NEXT: setp.ne.s64 %p9, %rd8, 0; +; CHECK-NEXT: 
sub.cc.s64 %rd66, %rd61, %rd65; +; CHECK-NEXT: subc.cc.s64 %rd67, %rd119, 0; +; CHECK-NEXT: setp.eq.s64 %p8, %rd67, 0; +; CHECK-NEXT: setp.ne.s64 %p9, %rd67, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p9; -; CHECK-NEXT: setp.gt.u64 %p10, %rd7, 127; +; CHECK-NEXT: setp.gt.u64 %p10, %rd66, 127; ; CHECK-NEXT: selp.u32 %r6, -1, 0, %p10; ; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p8; ; CHECK-NEXT: and.b32 %r8, %r7, 1; ; CHECK-NEXT: setp.eq.b32 %p11, %r8, 1; ; CHECK-NEXT: or.pred %p12, %p5, %p11; -; CHECK-NEXT: xor.b64 %rd66, %rd7, 127; -; CHECK-NEXT: or.b64 %rd67, %rd66, %rd8; -; CHECK-NEXT: setp.eq.s64 %p13, %rd67, 0; -; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p12; -; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p12; +; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; +; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; +; CHECK-NEXT: setp.eq.s64 %p13, %rd69, 0; +; CHECK-NEXT: selp.b64 %rd128, 0, %rd4, %p12; +; CHECK-NEXT: selp.b64 %rd127, 0, %rd3, %p12; ; CHECK-NEXT: or.pred %p14, %p12, %p13; ; CHECK-NEXT: @%p14 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd119, %rd7, 1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd8, 0; -; CHECK-NEXT: or.b64 %rd70, %rd119, %rd120; -; CHECK-NEXT: setp.eq.s64 %p15, %rd70, 0; -; CHECK-NEXT: cvt.u32.u64 %r9, %rd7; +; CHECK-NEXT: add.cc.s64 %rd121, %rd66, 1; +; CHECK-NEXT: addc.cc.s64 %rd122, %rd67, 0; +; CHECK-NEXT: or.b64 %rd72, %rd121, %rd122; +; CHECK-NEXT: setp.eq.s64 %p15, %rd72, 0; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; ; CHECK-NEXT: mov.b32 %r10, 127; ; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd71, %rd4, %r11; +; CHECK-NEXT: shl.b64 %rd73, %rd4, %r11; ; CHECK-NEXT: mov.b32 %r12, 64; ; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd72, %rd3, %r13; -; CHECK-NEXT: or.b64 %rd73, %rd71, %rd72; +; CHECK-NEXT: shr.u64 %rd74, %rd3, %r13; +; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; ; CHECK-NEXT: mov.b32 %r14, 63; ; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd74, %rd3, %r15; +; CHECK-NEXT: shl.b64 %rd76, %rd3, %r15; ; CHECK-NEXT: setp.gt.s32 %p16, %r11, 63; -; CHECK-NEXT: selp.b64 %rd124, %rd74, %rd73, %p16; -; CHECK-NEXT: shl.b64 %rd123, %rd3, %r11; -; CHECK-NEXT: mov.u64 %rd114, %rd117; +; CHECK-NEXT: selp.b64 %rd126, %rd76, %rd75, %p16; +; CHECK-NEXT: shl.b64 %rd125, %rd3, %r11; +; CHECK-NEXT: mov.u64 %rd116, %rd119; ; CHECK-NEXT: @%p15 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd119; -; CHECK-NEXT: shr.u64 %rd77, %rd3, %r16; +; CHECK-NEXT: cvt.u32.u64 %r16, %rd121; +; CHECK-NEXT: shr.u64 %rd79, %rd3, %r16; ; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd78, %rd4, %r18; -; CHECK-NEXT: or.b64 %rd79, %rd77, %rd78; +; CHECK-NEXT: shl.b64 %rd80, %rd4, %r18; +; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; ; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd80, %rd4, %r19; +; CHECK-NEXT: shr.u64 %rd82, %rd4, %r19; ; CHECK-NEXT: setp.gt.s32 %p17, %r16, 63; -; CHECK-NEXT: selp.b64 %rd121, %rd80, %rd79, %p17; -; CHECK-NEXT: shr.u64 %rd122, %rd4, %r16; +; CHECK-NEXT: selp.b64 %rd123, %rd82, %rd81, %p17; +; CHECK-NEXT: shr.u64 %rd124, %rd4, %r16; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.u64 %rd114, 0; -; CHECK-NEXT: mov.u64 %rd117, %rd114; +; CHECK-NEXT: mov.u64 %rd116, 0; +; CHECK-NEXT: mov.u64 %rd119, %rd116; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd81, %rd121, 63; -; CHECK-NEXT: shl.b64 %rd82, %rd122, 1; -; 
CHECK-NEXT: or.b64 %rd83, %rd82, %rd81; -; CHECK-NEXT: shl.b64 %rd84, %rd121, 1; -; CHECK-NEXT: shr.u64 %rd85, %rd124, 63; -; CHECK-NEXT: or.b64 %rd86, %rd84, %rd85; -; CHECK-NEXT: shr.u64 %rd87, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd88, %rd124, 1; -; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; -; CHECK-NEXT: shl.b64 %rd90, %rd123, 1; -; CHECK-NEXT: or.b64 %rd123, %rd117, %rd90; -; CHECK-NEXT: or.b64 %rd124, %rd114, %rd89; -; CHECK-NEXT: sub.cc.s64 %rd91, %rd35, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd92, %rd36, %rd83; -; CHECK-NEXT: shr.s64 %rd93, %rd92, 63; -; CHECK-NEXT: and.b64 %rd117, %rd93, 1; -; CHECK-NEXT: and.b64 %rd94, %rd93, %rd5; -; CHECK-NEXT: and.b64 %rd95, %rd93, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd121, %rd86, %rd94; -; CHECK-NEXT: subc.cc.s64 %rd122, %rd83, %rd95; -; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; -; CHECK-NEXT: or.b64 %rd96, %rd119, %rd120; -; CHECK-NEXT: setp.eq.s64 %p18, %rd96, 0; +; CHECK-NEXT: shr.u64 %rd83, %rd123, 63; +; CHECK-NEXT: shl.b64 %rd84, %rd124, 1; +; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; +; CHECK-NEXT: shl.b64 %rd86, %rd123, 1; +; CHECK-NEXT: shr.u64 %rd87, %rd126, 63; +; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; +; CHECK-NEXT: shr.u64 %rd89, %rd125, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd126, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd125, 1; +; CHECK-NEXT: or.b64 %rd125, %rd119, %rd92; +; CHECK-NEXT: or.b64 %rd126, %rd116, %rd91; +; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; +; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; +; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; +; CHECK-NEXT: and.b64 %rd119, %rd95, 1; +; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; +; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; +; CHECK-NEXT: sub.cc.s64 %rd123, %rd88, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd124, %rd85, %rd97; +; CHECK-NEXT: add.cc.s64 %rd121, %rd121, -1; +; CHECK-NEXT: addc.cc.s64 %rd122, %rd122, -1; +; CHECK-NEXT: or.b64 %rd98, %rd121, %rd122; +; CHECK-NEXT: setp.eq.s64 %p18, %rd98, 0; ; CHECK-NEXT: @%p18 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd97, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd98, %rd124, 1; -; CHECK-NEXT: or.b64 %rd99, %rd98, %rd97; -; CHECK-NEXT: shl.b64 %rd100, %rd123, 1; -; CHECK-NEXT: or.b64 %rd125, %rd117, %rd100; -; CHECK-NEXT: or.b64 %rd126, %rd114, %rd99; +; CHECK-NEXT: shr.u64 %rd99, %rd125, 63; +; CHECK-NEXT: shl.b64 %rd100, %rd126, 1; +; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; +; CHECK-NEXT: shl.b64 %rd102, %rd125, 1; +; CHECK-NEXT: or.b64 %rd127, %rd119, %rd102; +; CHECK-NEXT: or.b64 %rd128, %rd116, %rd101; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd101, %rd5, %rd125; -; CHECK-NEXT: mul.lo.s64 %rd102, %rd5, %rd126; -; CHECK-NEXT: add.s64 %rd103, %rd101, %rd102; -; CHECK-NEXT: mul.lo.s64 %rd104, %rd6, %rd125; +; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd127; +; CHECK-NEXT: mul.lo.s64 %rd104, %rd5, %rd128; ; CHECK-NEXT: add.s64 %rd105, %rd103, %rd104; -; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; -; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; -; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd111, %rd112}; +; CHECK-NEXT: mul.lo.s64 %rd106, %rd6, %rd127; +; CHECK-NEXT: add.s64 %rd107, %rd105, %rd106; +; CHECK-NEXT: mul.lo.s64 %rd108, %rd5, 
%rd127; +; CHECK-NEXT: sub.cc.s64 %rd109, %rd3, %rd108; +; CHECK-NEXT: subc.cc.s64 %rd110, %rd4, %rd107; +; CHECK-NEXT: xor.b64 %rd111, %rd109, %rd2; +; CHECK-NEXT: xor.b64 %rd112, %rd110, %rd2; +; CHECK-NEXT: sub.cc.s64 %rd113, %rd111, %rd2; +; CHECK-NEXT: subc.cc.s64 %rd114, %rd112, %rd2; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd113, %rd114}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -156,7 +156,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; ; CHECK-NEXT: .reg .b32 %r<20>; -; CHECK-NEXT: .reg .b64 %rd<113>; +; CHECK-NEXT: .reg .b64 %rd<115>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0]; @@ -180,106 +180,106 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.u64 %rd103, 0; -; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd6, %rd103, 0; -; CHECK-NEXT: setp.eq.s64 %p6, %rd6, 0; -; CHECK-NEXT: setp.ne.s64 %p7, %rd6, 0; +; CHECK-NEXT: mov.u64 %rd105, 0; +; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; +; CHECK-NEXT: subc.cc.s64 %rd57, %rd105, 0; +; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0; +; CHECK-NEXT: setp.ne.s64 %p7, %rd57, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p7; -; CHECK-NEXT: setp.gt.u64 %p8, %rd5, 127; +; CHECK-NEXT: setp.gt.u64 %p8, %rd56, 127; ; CHECK-NEXT: selp.u32 %r6, -1, 0, %p8; ; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p6; ; CHECK-NEXT: and.b32 %r8, %r7, 1; ; CHECK-NEXT: setp.eq.b32 %p9, %r8, 1; ; CHECK-NEXT: or.pred %p10, %p3, %p9; -; CHECK-NEXT: xor.b64 %rd56, %rd5, 127; -; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6; -; CHECK-NEXT: setp.eq.s64 %p11, %rd57, 0; -; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p10; -; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p10; +; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; +; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; +; CHECK-NEXT: setp.eq.s64 %p11, %rd59, 0; +; CHECK-NEXT: selp.b64 %rd114, 0, %rd42, %p10; +; CHECK-NEXT: selp.b64 %rd113, 0, %rd41, %p10; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: @%p12 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd105, %rd5, 1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd6, 0; -; CHECK-NEXT: or.b64 %rd60, %rd105, %rd106; -; CHECK-NEXT: setp.eq.s64 %p13, %rd60, 0; -; CHECK-NEXT: cvt.u32.u64 %r9, %rd5; +; CHECK-NEXT: add.cc.s64 %rd107, %rd56, 1; +; CHECK-NEXT: addc.cc.s64 %rd108, %rd57, 0; +; CHECK-NEXT: or.b64 %rd62, %rd107, %rd108; +; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; ; CHECK-NEXT: mov.b32 %r10, 127; ; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd61, %rd42, %r11; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r11; ; CHECK-NEXT: mov.b32 %r12, 64; ; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd62, %rd41, %r13; -; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r13; +; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; ; CHECK-NEXT: mov.b32 %r14, 63; ; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd64, %rd41, %r15; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r15; ; CHECK-NEXT: setp.gt.s32 %p14, %r11, 63; -; CHECK-NEXT: selp.b64 %rd110, %rd64, %rd63, %p14; -; CHECK-NEXT: shl.b64 %rd109, %rd41, %r11; -; CHECK-NEXT: mov.u64 %rd100, %rd103; +; CHECK-NEXT: selp.b64 %rd112, %rd66, %rd65, %p14; +; CHECK-NEXT: shl.b64 %rd111, %rd41, %r11; +; CHECK-NEXT: mov.u64 %rd102, 
%rd105; ; CHECK-NEXT: @%p13 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd105; -; CHECK-NEXT: shr.u64 %rd67, %rd41, %r16; +; CHECK-NEXT: cvt.u32.u64 %r16, %rd107; +; CHECK-NEXT: shr.u64 %rd69, %rd41, %r16; ; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd68, %rd42, %r18; -; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68; +; CHECK-NEXT: shl.b64 %rd70, %rd42, %r18; +; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; ; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd70, %rd42, %r19; +; CHECK-NEXT: shr.u64 %rd72, %rd42, %r19; ; CHECK-NEXT: setp.gt.s32 %p15, %r16, 63; -; CHECK-NEXT: selp.b64 %rd107, %rd70, %rd69, %p15; -; CHECK-NEXT: shr.u64 %rd108, %rd42, %r16; +; CHECK-NEXT: selp.b64 %rd109, %rd72, %rd71, %p15; +; CHECK-NEXT: shr.u64 %rd110, %rd42, %r16; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.u64 %rd100, 0; -; CHECK-NEXT: mov.u64 %rd103, %rd100; +; CHECK-NEXT: mov.u64 %rd102, 0; +; CHECK-NEXT: mov.u64 %rd105, %rd102; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd71, %rd107, 63; -; CHECK-NEXT: shl.b64 %rd72, %rd108, 1; -; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71; -; CHECK-NEXT: shl.b64 %rd74, %rd107, 1; -; CHECK-NEXT: shr.u64 %rd75, %rd110, 63; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; -; CHECK-NEXT: shr.u64 %rd77, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd78, %rd110, 1; -; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77; -; CHECK-NEXT: shl.b64 %rd80, %rd109, 1; -; CHECK-NEXT: or.b64 %rd109, %rd103, %rd80; -; CHECK-NEXT: or.b64 %rd110, %rd100, %rd79; -; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76; -; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73; -; CHECK-NEXT: shr.s64 %rd83, %rd82, 63; -; CHECK-NEXT: and.b64 %rd103, %rd83, 1; -; CHECK-NEXT: and.b64 %rd84, %rd83, %rd3; -; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd76, %rd84; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd73, %rd85; -; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; -; CHECK-NEXT: or.b64 %rd86, %rd105, %rd106; -; CHECK-NEXT: setp.eq.s64 %p16, %rd86, 0; +; CHECK-NEXT: shr.u64 %rd73, %rd109, 63; +; CHECK-NEXT: shl.b64 %rd74, %rd110, 1; +; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; +; CHECK-NEXT: shl.b64 %rd76, %rd109, 1; +; CHECK-NEXT: shr.u64 %rd77, %rd112, 63; +; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; +; CHECK-NEXT: shr.u64 %rd79, %rd111, 63; +; CHECK-NEXT: shl.b64 %rd80, %rd112, 1; +; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; +; CHECK-NEXT: shl.b64 %rd82, %rd111, 1; +; CHECK-NEXT: or.b64 %rd111, %rd105, %rd82; +; CHECK-NEXT: or.b64 %rd112, %rd102, %rd81; +; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; +; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; +; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; +; CHECK-NEXT: and.b64 %rd105, %rd85, 1; +; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; +; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd109, %rd78, %rd86; +; CHECK-NEXT: subc.cc.s64 %rd110, %rd75, %rd87; +; CHECK-NEXT: add.cc.s64 %rd107, %rd107, -1; +; CHECK-NEXT: addc.cc.s64 %rd108, %rd108, -1; +; CHECK-NEXT: or.b64 %rd88, %rd107, %rd108; +; CHECK-NEXT: setp.eq.s64 %p16, %rd88, 0; ; CHECK-NEXT: @%p16 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd87, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd88, %rd110, 1; -; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; -; CHECK-NEXT: shl.b64 %rd90, %rd109, 1; -; CHECK-NEXT: or.b64 
%rd111, %rd103, %rd90; -; CHECK-NEXT: or.b64 %rd112, %rd100, %rd89; +; CHECK-NEXT: shr.u64 %rd89, %rd111, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd112, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd111, 1; +; CHECK-NEXT: or.b64 %rd113, %rd105, %rd92; +; CHECK-NEXT: or.b64 %rd114, %rd102, %rd91; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd111; -; CHECK-NEXT: mul.lo.s64 %rd92, %rd3, %rd112; -; CHECK-NEXT: add.s64 %rd93, %rd91, %rd92; -; CHECK-NEXT: mul.lo.s64 %rd94, %rd4, %rd111; +; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd113; +; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd114; ; CHECK-NEXT: add.s64 %rd95, %rd93, %rd94; -; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; -; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd97, %rd98}; +; CHECK-NEXT: mul.lo.s64 %rd96, %rd4, %rd113; +; CHECK-NEXT: add.s64 %rd97, %rd95, %rd96; +; CHECK-NEXT: mul.lo.s64 %rd98, %rd3, %rd113; +; CHECK-NEXT: sub.cc.s64 %rd99, %rd41, %rd98; +; CHECK-NEXT: subc.cc.s64 %rd100, %rd42, %rd97; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd99, %rd100}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div @@ -325,19 +325,19 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; ; CHECK-NEXT: .reg .b32 %r<20>; -; CHECK-NEXT: .reg .b64 %rd<120>; +; CHECK-NEXT: .reg .b64 %rd<122>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0]; ; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1]; -; CHECK-NEXT: mov.u64 %rd110, 0; -; CHECK-NEXT: sub.cc.s64 %rd52, %rd110, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd53, %rd110, %rd46; +; CHECK-NEXT: mov.u64 %rd112, 0; +; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; ; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1; ; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd54, %rd110, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd55, %rd110, %rd50; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50; ; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; ; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2; ; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2; @@ -362,101 +362,101 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd64, %r4; ; CHECK-NEXT: add.s64 %rd65, %rd64, 64; ; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7; -; CHECK-NEXT: sub.cc.s64 %rd7, %rd62, %rd66; -; CHECK-NEXT: subc.cc.s64 %rd8, %rd110, 0; -; CHECK-NEXT: setp.eq.s64 %p8, %rd8, 0; -; CHECK-NEXT: setp.ne.s64 %p9, %rd8, 0; +; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66; +; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; +; CHECK-NEXT: setp.eq.s64 %p8, %rd68, 0; +; CHECK-NEXT: setp.ne.s64 %p9, %rd68, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p9; -; CHECK-NEXT: setp.gt.u64 %p10, %rd7, 127; +; CHECK-NEXT: setp.gt.u64 %p10, %rd67, 127; ; CHECK-NEXT: selp.u32 %r6, -1, 0, %p10; ; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p8; ; CHECK-NEXT: and.b32 %r8, %r7, 1; ; CHECK-NEXT: setp.eq.b32 %p11, %r8, 1; ; CHECK-NEXT: or.pred %p12, %p5, %p11; -; CHECK-NEXT: xor.b64 %rd67, %rd7, 127; -; CHECK-NEXT: or.b64 %rd68, %rd67, %rd8; -; CHECK-NEXT: setp.eq.s64 %p13, %rd68, 0; -; CHECK-NEXT: selp.b64 %rd119, 0, %rd2, %p12; -; CHECK-NEXT: selp.b64 %rd118, 0, %rd1, %p12; +; CHECK-NEXT: xor.b64 %rd69, %rd67, 127; +; CHECK-NEXT: or.b64 
%rd70, %rd69, %rd68; +; CHECK-NEXT: setp.eq.s64 %p13, %rd70, 0; +; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p12; +; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p12; ; CHECK-NEXT: or.pred %p14, %p12, %p13; ; CHECK-NEXT: @%p14 bra $L__BB4_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd112, %rd7, 1; -; CHECK-NEXT: addc.cc.s64 %rd113, %rd8, 0; -; CHECK-NEXT: or.b64 %rd71, %rd112, %rd113; -; CHECK-NEXT: setp.eq.s64 %p15, %rd71, 0; -; CHECK-NEXT: cvt.u32.u64 %r9, %rd7; +; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1; +; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0; +; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; +; CHECK-NEXT: setp.eq.s64 %p15, %rd73, 0; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd67; ; CHECK-NEXT: mov.b32 %r10, 127; ; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd72, %rd2, %r11; +; CHECK-NEXT: shl.b64 %rd74, %rd2, %r11; ; CHECK-NEXT: mov.b32 %r12, 64; ; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd73, %rd1, %r13; -; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73; +; CHECK-NEXT: shr.u64 %rd75, %rd1, %r13; +; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; ; CHECK-NEXT: mov.b32 %r14, 63; ; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd75, %rd1, %r15; +; CHECK-NEXT: shl.b64 %rd77, %rd1, %r15; ; CHECK-NEXT: setp.gt.s32 %p16, %r11, 63; -; CHECK-NEXT: selp.b64 %rd117, %rd75, %rd74, %p16; -; CHECK-NEXT: shl.b64 %rd116, %rd1, %r11; -; CHECK-NEXT: mov.u64 %rd107, %rd110; +; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p16; +; CHECK-NEXT: shl.b64 %rd118, %rd1, %r11; +; CHECK-NEXT: mov.u64 %rd109, %rd112; ; CHECK-NEXT: @%p15 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd112; -; CHECK-NEXT: shr.u64 %rd78, %rd1, %r16; +; CHECK-NEXT: cvt.u32.u64 %r16, %rd114; +; CHECK-NEXT: shr.u64 %rd80, %rd1, %r16; ; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd79, %rd2, %r18; -; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79; +; CHECK-NEXT: shl.b64 %rd81, %rd2, %r18; +; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; ; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd81, %rd2, %r19; +; CHECK-NEXT: shr.u64 %rd83, %rd2, %r19; ; CHECK-NEXT: setp.gt.s32 %p17, %r16, 63; -; CHECK-NEXT: selp.b64 %rd114, %rd81, %rd80, %p17; -; CHECK-NEXT: shr.u64 %rd115, %rd2, %r16; +; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p17; +; CHECK-NEXT: shr.u64 %rd117, %rd2, %r16; ; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; -; CHECK-NEXT: mov.u64 %rd107, 0; -; CHECK-NEXT: mov.u64 %rd110, %rd107; +; CHECK-NEXT: mov.u64 %rd109, 0; +; CHECK-NEXT: mov.u64 %rd112, %rd109; ; CHECK-NEXT: $L__BB4_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd82, %rd114, 63; -; CHECK-NEXT: shl.b64 %rd83, %rd115, 1; -; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82; -; CHECK-NEXT: shl.b64 %rd85, %rd114, 1; -; CHECK-NEXT: shr.u64 %rd86, %rd117, 63; -; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86; -; CHECK-NEXT: shr.u64 %rd88, %rd116, 63; -; CHECK-NEXT: shl.b64 %rd89, %rd117, 1; -; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88; -; CHECK-NEXT: shl.b64 %rd91, %rd116, 1; -; CHECK-NEXT: or.b64 %rd116, %rd110, %rd91; -; CHECK-NEXT: or.b64 %rd117, %rd107, %rd90; -; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87; -; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84; -; CHECK-NEXT: shr.s64 %rd94, %rd93, 63; -; CHECK-NEXT: and.b64 %rd110, %rd94, 1; -; CHECK-NEXT: and.b64 %rd95, %rd94, %rd3; -; CHECK-NEXT: and.b64 %rd96, %rd94, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd114, %rd87, %rd95; -; CHECK-NEXT: subc.cc.s64 %rd115, 
%rd84, %rd96; -; CHECK-NEXT: add.cc.s64 %rd112, %rd112, -1; -; CHECK-NEXT: addc.cc.s64 %rd113, %rd113, -1; -; CHECK-NEXT: or.b64 %rd97, %rd112, %rd113; -; CHECK-NEXT: setp.eq.s64 %p18, %rd97, 0; +; CHECK-NEXT: shr.u64 %rd84, %rd116, 63; +; CHECK-NEXT: shl.b64 %rd85, %rd117, 1; +; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84; +; CHECK-NEXT: shl.b64 %rd87, %rd116, 1; +; CHECK-NEXT: shr.u64 %rd88, %rd119, 63; +; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88; +; CHECK-NEXT: shr.u64 %rd90, %rd118, 63; +; CHECK-NEXT: shl.b64 %rd91, %rd119, 1; +; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90; +; CHECK-NEXT: shl.b64 %rd93, %rd118, 1; +; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93; +; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92; +; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89; +; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86; +; CHECK-NEXT: shr.s64 %rd96, %rd95, 63; +; CHECK-NEXT: and.b64 %rd112, %rd96, 1; +; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3; +; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97; +; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98; +; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1; +; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1; +; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115; +; CHECK-NEXT: setp.eq.s64 %p18, %rd99, 0; ; CHECK-NEXT: @%p18 bra $L__BB4_4; ; CHECK-NEXT: bra.uni $L__BB4_2; ; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd98, %rd116, 63; -; CHECK-NEXT: shl.b64 %rd99, %rd117, 1; -; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98; -; CHECK-NEXT: shl.b64 %rd101, %rd116, 1; -; CHECK-NEXT: or.b64 %rd118, %rd110, %rd101; -; CHECK-NEXT: or.b64 %rd119, %rd107, %rd100; +; CHECK-NEXT: shr.u64 %rd100, %rd118, 63; +; CHECK-NEXT: shl.b64 %rd101, %rd119, 1; +; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100; +; CHECK-NEXT: shl.b64 %rd103, %rd118, 1; +; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103; +; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102; ; CHECK-NEXT: $L__BB4_5: // %udiv-end -; CHECK-NEXT: xor.b64 %rd102, %rd118, %rd5; -; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5; -; CHECK-NEXT: sub.cc.s64 %rd104, %rd102, %rd5; -; CHECK-NEXT: subc.cc.s64 %rd105, %rd103, %rd5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd104, %rd105}; +; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5; +; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5; +; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5; +; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd106, %rd107}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, %rhs ret i128 %div @@ -467,7 +467,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; ; CHECK-NEXT: .reg .b32 %r<20>; -; CHECK-NEXT: .reg .b64 %rd<105>; +; CHECK-NEXT: .reg .b64 %rd<107>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [udiv_i128_param_0]; @@ -491,98 +491,98 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.u64 %rd95, 0; -; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0; -; CHECK-NEXT: setp.eq.s64 %p6, %rd6, 0; -; CHECK-NEXT: setp.ne.s64 %p7, %rd6, 0; +; CHECK-NEXT: mov.u64 %rd97, 0; +; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; +; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0; +; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0; +; CHECK-NEXT: setp.ne.s64 %p7, %rd57, 0; ; CHECK-NEXT: selp.u32 %r5, -1, 0, %p7; -; CHECK-NEXT: setp.gt.u64 %p8, %rd5, 127; +; CHECK-NEXT: setp.gt.u64 
%p8, %rd56, 127; ; CHECK-NEXT: selp.u32 %r6, -1, 0, %p8; ; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p6; ; CHECK-NEXT: and.b32 %r8, %r7, 1; ; CHECK-NEXT: setp.eq.b32 %p9, %r8, 1; ; CHECK-NEXT: or.pred %p10, %p3, %p9; -; CHECK-NEXT: xor.b64 %rd56, %rd5, 127; -; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6; -; CHECK-NEXT: setp.eq.s64 %p11, %rd57, 0; -; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p10; -; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p10; +; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; +; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; +; CHECK-NEXT: setp.eq.s64 %p11, %rd59, 0; +; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p10; +; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p10; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: @%p12 bra $L__BB5_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1; -; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0; -; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98; -; CHECK-NEXT: setp.eq.s64 %p13, %rd60, 0; -; CHECK-NEXT: cvt.u32.u64 %r9, %rd5; +; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1; +; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0; +; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; +; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; ; CHECK-NEXT: mov.b32 %r10, 127; ; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd61, %rd42, %r11; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r11; ; CHECK-NEXT: mov.b32 %r12, 64; ; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd62, %rd41, %r13; -; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r13; +; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; ; CHECK-NEXT: mov.b32 %r14, 63; ; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd64, %rd41, %r15; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r15; ; CHECK-NEXT: setp.gt.s32 %p14, %r11, 63; -; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p14; -; CHECK-NEXT: shl.b64 %rd101, %rd41, %r11; -; CHECK-NEXT: mov.u64 %rd92, %rd95; +; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p14; +; CHECK-NEXT: shl.b64 %rd103, %rd41, %r11; +; CHECK-NEXT: mov.u64 %rd94, %rd97; ; CHECK-NEXT: @%p13 bra $L__BB5_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd97; -; CHECK-NEXT: shr.u64 %rd67, %rd41, %r16; +; CHECK-NEXT: cvt.u32.u64 %r16, %rd99; +; CHECK-NEXT: shr.u64 %rd69, %rd41, %r16; ; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd68, %rd42, %r18; -; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68; +; CHECK-NEXT: shl.b64 %rd70, %rd42, %r18; +; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; ; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd70, %rd42, %r19; +; CHECK-NEXT: shr.u64 %rd72, %rd42, %r19; ; CHECK-NEXT: setp.gt.s32 %p15, %r16, 63; -; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p15; -; CHECK-NEXT: shr.u64 %rd100, %rd42, %r16; +; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p15; +; CHECK-NEXT: shr.u64 %rd102, %rd42, %r16; ; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1; -; CHECK-NEXT: mov.u64 %rd92, 0; -; CHECK-NEXT: mov.u64 %rd95, %rd92; +; CHECK-NEXT: mov.u64 %rd94, 0; +; CHECK-NEXT: mov.u64 %rd97, %rd94; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd71, %rd99, 63; -; CHECK-NEXT: shl.b64 %rd72, %rd100, 1; -; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71; -; CHECK-NEXT: shl.b64 %rd74, %rd99, 1; -; CHECK-NEXT: shr.u64 %rd75, %rd102, 63; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; -; CHECK-NEXT: shr.u64 %rd77, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd78, %rd102, 1; -; CHECK-NEXT: or.b64 
%rd79, %rd78, %rd77; -; CHECK-NEXT: shl.b64 %rd80, %rd101, 1; -; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80; -; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79; -; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76; -; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73; -; CHECK-NEXT: shr.s64 %rd83, %rd82, 63; -; CHECK-NEXT: and.b64 %rd95, %rd83, 1; -; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43; -; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44; -; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84; -; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85; -; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1; -; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1; -; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98; -; CHECK-NEXT: setp.eq.s64 %p16, %rd86, 0; +; CHECK-NEXT: shr.u64 %rd73, %rd101, 63; +; CHECK-NEXT: shl.b64 %rd74, %rd102, 1; +; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; +; CHECK-NEXT: shl.b64 %rd76, %rd101, 1; +; CHECK-NEXT: shr.u64 %rd77, %rd104, 63; +; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; +; CHECK-NEXT: shr.u64 %rd79, %rd103, 63; +; CHECK-NEXT: shl.b64 %rd80, %rd104, 1; +; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; +; CHECK-NEXT: shl.b64 %rd82, %rd103, 1; +; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82; +; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81; +; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; +; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; +; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; +; CHECK-NEXT: and.b64 %rd97, %rd85, 1; +; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43; +; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44; +; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86; +; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87; +; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1; +; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1; +; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100; +; CHECK-NEXT: setp.eq.s64 %p16, %rd88, 0; ; CHECK-NEXT: @%p16 bra $L__BB5_4; ; CHECK-NEXT: bra.uni $L__BB5_2; ; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd87, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd88, %rd102, 1; -; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; -; CHECK-NEXT: shl.b64 %rd90, %rd101, 1; -; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90; -; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89; +; CHECK-NEXT: shr.u64 %rd89, %rd103, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd104, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd103, 1; +; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92; +; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91; ; CHECK-NEXT: $L__BB5_5: // %udiv-end -; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd103, %rd104}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd105, %rd106}; ; CHECK-NEXT: ret; %div = udiv i128 %lhs, %rhs ret i128 %div diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll index 3fa494e1a57dd..f9b9c8a69d431 100644 --- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll +++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll @@ -54,7 +54,8 @@ define i1 @pr85190(i64 %a) { ; CHECK-ZBB-NEXT: li a2, -1 ; CHECK-ZBB-NEXT: slli a2, a2, 63 ; CHECK-ZBB-NEXT: sub a2, a2, a1 -; CHECK-ZBB-NEXT: slt a0, a0, a2 +; CHECK-ZBB-NEXT: min a1, a2, zero +; CHECK-ZBB-NEXT: slt a0, a0, a1 ; CHECK-ZBB-NEXT: ret %or = or i64 %a, 7 %cmp1 = icmp slt i64 %a, 0 diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll index 8a6a30318ae58..62f5d49192ea9 100644 --- a/llvm/test/CodeGen/SystemZ/pr60413.ll +++ b/llvm/test/CodeGen/SystemZ/pr60413.ll @@ -13,7 +13,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0 define dso_local void @m() local_unnamed_addr #1 { ; CHECK-LABEL: m: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: 
stmg %r13, %r15, 104(%r15) ; CHECK-NEXT: aghi %r15, -168 ; CHECK-NEXT: lhrl %r1, f+4 ; CHECK-NEXT: sll %r1, 8 @@ -21,59 +20,66 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: ic %r1, 6(%r2) ; CHECK-NEXT: larl %r2, e ; CHECK-NEXT: lb %r0, 3(%r2) -; CHECK-NEXT: vlvgp %v0, %r0, %r1 -; CHECK-NEXT: vlvgp %v1, %r1, %r0 ; CHECK-NEXT: vlvgf %v1, %r1, 0 -; CHECK-NEXT: vlvgf %v1, %r1, 2 -; CHECK-NEXT: vlvgp %v2, %r1, %r1 -; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d +; CHECK-NEXT: vlvgf %v1, %r1, 1 +; CHECK-NEXT: larl %r2, .LCPI0_0 +; CHECK-NEXT: vl %v2, 0(%r2), 3 +; CHECK-NEXT: vlvgf %v1, %r1, 3 +; CHECK-NEXT: vlvgf %v3, %r1, 3 +; CHECK-NEXT: vlvgf %v0, %r1, 1 +; CHECK-NEXT: vperm %v4, %v1, %v0, %v2 +; CHECK-NEXT: vlvgf %v0, %r1, 3 ; CHECK-NEXT: nilh %r1, 255 ; CHECK-NEXT: chi %r1, 128 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36 -; CHECK-NEXT: vlvgf %v0, %r0, 0 -; CHECK-NEXT: vlvgf %v0, %r0, 2 -; CHECK-NEXT: vgbm %v3, 30583 -; CHECK-NEXT: vn %v0, %v0, %v3 -; CHECK-NEXT: vn %v1, %v1, %v3 -; CHECK-NEXT: vrepf %v2, %v2, 1 -; CHECK-NEXT: vn %v2, %v2, %v3 -; CHECK-NEXT: vrepif %v3, 127 -; CHECK-NEXT: vchlf %v1, %v1, %v3 -; CHECK-NEXT: vlgvf %r13, %v1, 0 -; CHECK-NEXT: vchlf %v2, %v2, %v3 +; CHECK-NEXT: vperm %v0, %v3, %v0, %v2 +; CHECK-NEXT: larl %r2, .LCPI0_1 +; CHECK-NEXT: vl %v5, 0(%r2), 3 +; CHECK-NEXT: vgbm %v6, 30583 +; CHECK-NEXT: vn %v0, %v0, %v6 +; CHECK-NEXT: vn %v4, %v4, %v6 +; CHECK-NEXT: vperm %v1, %v1, %v1, %v5 +; CHECK-NEXT: vn %v5, %v1, %v6 +; CHECK-NEXT: vperm %v1, %v0, %v3, %v2 +; CHECK-NEXT: vn %v2, %v1, %v6 +; CHECK-NEXT: vrepif %v1, 127 +; CHECK-NEXT: vchlf %v3, %v5, %v1 +; CHECK-NEXT: vlgvf %r3, %v3, 1 +; CHECK-NEXT: vlgvf %r2, %v3, 0 +; CHECK-NEXT: risbg %r2, %r2, 48, 176, 15 +; CHECK-NEXT: rosbg %r2, %r3, 49, 49, 14 +; CHECK-NEXT: vlgvf %r3, %v3, 2 +; CHECK-NEXT: rosbg %r2, %r3, 50, 50, 13 +; CHECK-NEXT: vlgvf %r3, %v3, 3 +; CHECK-NEXT: rosbg %r2, %r3, 51, 51, 12 +; CHECK-NEXT: vchlf %v3, %v4, %v1 +; CHECK-NEXT: vlgvf %r3, %v3, 0 +; CHECK-NEXT: rosbg %r2, %r3, 52, 52, 11 +; CHECK-NEXT: vlgvf %r3, %v3, 1 +; CHECK-NEXT: rosbg %r2, %r3, 53, 53, 10 +; CHECK-NEXT: vlgvf %r3, %v3, 2 +; CHECK-NEXT: rosbg %r2, %r3, 54, 54, 9 +; CHECK-NEXT: vlgvf %r3, %v3, 3 +; CHECK-NEXT: rosbg %r2, %r3, 55, 55, 8 +; CHECK-NEXT: vchlf %v2, %v2, %v1 +; CHECK-NEXT: vlgvf %r3, %v2, 0 +; CHECK-NEXT: rosbg %r2, %r3, 56, 56, 7 ; CHECK-NEXT: vlgvf %r3, %v2, 1 -; CHECK-NEXT: nilf %r3, 1 -; CHECK-NEXT: vlgvf %r4, %v2, 0 -; CHECK-NEXT: risbg %r2, %r4, 48, 176, 15 -; CHECK-NEXT: rosbg %r2, %r3, 32, 49, 14 -; CHECK-NEXT: vlgvf %r5, %v2, 2 -; CHECK-NEXT: nilf %r5, 1 -; CHECK-NEXT: rosbg %r2, %r5, 32, 50, 13 -; CHECK-NEXT: vlgvf %r14, %v2, 3 -; CHECK-NEXT: nilf %r14, 1 -; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12 -; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11 -; CHECK-NEXT: vlgvf %r13, %v1, 1 -; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10 -; CHECK-NEXT: vlgvf %r13, %v1, 2 -; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9 -; CHECK-NEXT: vlgvf %r13, %v1, 3 -; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8 -; CHECK-NEXT: vchlf %v0, %v0, %v3 -; CHECK-NEXT: vlgvf %r13, %v0, 0 -; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7 -; CHECK-NEXT: vlgvf %r13, %v0, 1 -; CHECK-NEXT: rosbg %r2, %r13, 57, 57, 6 -; CHECK-NEXT: vlgvf %r13, %v0, 2 -; CHECK-NEXT: rosbg %r2, %r13, 58, 58, 5 -; CHECK-NEXT: vlgvf %r13, %v0, 3 -; CHECK-NEXT: rosbg %r2, %r13, 59, 59, 4 -; CHECK-NEXT: nilf %r4, 1 -; CHECK-NEXT: rosbg %r2, %r4, 32, 60, 3 -; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2 -; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1 -; 
CHECK-NEXT: or %r2, %r14 +; CHECK-NEXT: rosbg %r2, %r3, 57, 57, 6 +; CHECK-NEXT: vlgvf %r3, %v2, 2 +; CHECK-NEXT: rosbg %r2, %r3, 58, 58, 5 +; CHECK-NEXT: vlgvf %r3, %v2, 3 +; CHECK-NEXT: rosbg %r2, %r3, 59, 59, 4 +; CHECK-NEXT: vchlf %v0, %v0, %v1 +; CHECK-NEXT: vlgvf %r3, %v0, 0 +; CHECK-NEXT: rosbg %r2, %r3, 60, 60, 3 +; CHECK-NEXT: vlgvf %r3, %v0, 1 +; CHECK-NEXT: rosbg %r2, %r3, 61, 61, 2 +; CHECK-NEXT: vlgvf %r3, %v0, 2 +; CHECK-NEXT: rosbg %r2, %r3, 62, 62, 1 +; CHECK-NEXT: vlgvf %r3, %v0, 3 +; CHECK-NEXT: rosbg %r2, %r3, 63, 63, 0 ; CHECK-NEXT: vlgvb %r4, %v0, 1 ; CHECK-NEXT: vlgvb %r3, %v0, 0 ; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15 @@ -116,7 +122,7 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: nr %r2, %r0 ; CHECK-NEXT: larl %r1, g ; CHECK-NEXT: stc %r2, 0(%r1) -; CHECK-NEXT: lmg %r13, %r15, 272(%r15) +; CHECK-NEXT: aghi %r15, 168 ; CHECK-NEXT: br %r14 entry: %n = alloca i32, align 4 diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll index 12aa101cb48c4..51da557c6c49f 100644 --- a/llvm/test/CodeGen/VE/Scalar/max.ll +++ b/llvm/test/CodeGen/VE/Scalar/max.ll @@ -281,11 +281,13 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) { ; CHECK-LABEL: maxi1: ; CHECK: # %bb.0: ; CHECK-NEXT: or %s0, %s0, %s1 +; CHECK-NEXT: and %s0, 1, %s0 ; CHECK-NEXT: b.l.t (, %s10) ; ; OPT-LABEL: maxi1: ; OPT: # %bb.0: ; OPT-NEXT: or %s0, %s0, %s1 +; OPT-NEXT: and %s0, 1, %s0 ; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %1, true %4 = and i1 %3, %0 diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll index da92ebafd0590..69d5ce48601f8 100644 --- a/llvm/test/CodeGen/VE/Scalar/min.ll +++ b/llvm/test/CodeGen/VE/Scalar/min.ll @@ -278,6 +278,7 @@ define i32 @min2u32(i32, i32) { define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; CHECK-LABEL: mini1: ; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: and %s2, %s1, %s0 ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 ; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 @@ -285,6 +286,7 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; ; OPT-LABEL: mini1: ; OPT: # %bb.0: +; OPT-NEXT: and %s0, %s0, (32)0 ; OPT-NEXT: and %s2, %s1, %s0 ; OPT-NEXT: cmov.w.ne %s2, %s1, %s0 ; OPT-NEXT: adds.w.zx %s0, %s2, (0)1 diff --git a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll index f18497065c943..d011efa5b6140 100644 --- a/llvm/test/CodeGen/X86/combine-pmadd.ll +++ b/llvm/test/CodeGen/X86/combine-pmadd.ll @@ -293,10 +293,41 @@ define i32 @combine_pmaddubsw_constant_sat() { ; Constant folding PMADDWD was causing an infinite loop in the PCMPGT commuting between 2 constant values. 
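; A minimal sketch of that ping-pong (illustrative only; node and constant
; names are made up, not actual DAG dump output): once the PMADDWD folds to a
; constant, both PCMPGT operands are constants, so a combine that commutes a
; constant to the other side can keep firing without reaching a fixed point:
;   t1: v4i32 = X86ISD::PCMPGT Constant:C0, Constant:C1   --> commuted to
;   t2: v4i32 = X86ISD::PCMPGT Constant:C1, Constant:C0   --> commuted back, forever
; The function below is the reduced reproducer for that hang.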
define i1 @pmaddwd_pcmpgt_infinite_loop() { -; CHECK-LABEL: pmaddwd_pcmpgt_infinite_loop: -; CHECK: # %bb.0: -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: retq +; SSE-LABEL: pmaddwd_pcmpgt_infinite_loop: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $1, %xmm1 +; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1-LABEL: pmaddwd_pcmpgt_infinite_loop: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: pmaddwd_pcmpgt_infinite_loop: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vtestps %xmm1, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq %1 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> , <8 x i16> ) %2 = icmp eq <4 x i32> %1, %3 = select <4 x i1> %2, <4 x i32> , <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index aa7b77f01d5ba..1c303de55c95d 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -177,7 +177,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $152, %esp +; X86-NEXT: subl $156, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -273,42 +273,44 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: subl %edx, %edi -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $127, %ecx ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpl %edi, %ecx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload ; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: cmovnel %esi, %edx -; 
X86-NEXT: cmovnel %esi, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: cmovnel %esi, %ebx -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: jne .LBB4_8 ; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl $127, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: xorl $127, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: je .LBB4_8 ; X86-NEXT: # %bb.2: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -331,34 +333,34 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 144(%esp,%eax), %edx -; X86-NEXT: movl 148(%esp,%eax), %esi +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 148(%esp,%edi), %edx +; X86-NEXT: movl 152(%esp,%edi), %esi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 140(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl 144(%esp,%edi), %eax +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: shrl %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: orl %edx, %ebp -; X86-NEXT: movl 136(%esp,%eax), %eax +; X86-NEXT: movl 140(%esp,%edi), %edx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $0, %edx ; X86-NEXT: jae .LBB4_3 ; X86-NEXT: # %bb.6: +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_3: # %udiv-preheader @@ -374,180 +376,176 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movb %al, %ch +; X86-NEXT: movb %dl, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 100(%esp,%eax), %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%eax), %edx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movb %dl, %cl +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: movl 104(%esp,%edx), %ebx +; X86-NEXT: movl 100(%esp,%edx), %edi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %esi, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esp,%eax), %ebx -; X86-NEXT: movl 92(%esp,%eax), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: shrl %cl, %eax +; X86-NEXT: shrdl %cl, %ebx, %ebp +; X86-NEXT: movl 92(%esp,%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%edx), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrl %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: addl %edx, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %esi, %ebx +; X86-NEXT: shrl %cl, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, %edx +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %ebp -; X86-NEXT: shldl $1, %ecx, %ebx -; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ebp, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl %edi, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: 
sbbl %esi, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: adcl $-1, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl $-1, %esi -; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: jne .LBB4_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit ; X86-NEXT: shldl $1, %ebp, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: shldl $1, %ebx, %ebp -; X86-NEXT: orl %eax, %ebp +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: shldl $1, %eax, %ebp +; X86-NEXT: orl %ecx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ebx -; X86-NEXT: orl %eax, %ebx +; X86-NEXT: shldl $1, %esi, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edi, %esi ; X86-NEXT: .LBB4_8: # %udiv-end -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: subl %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, (%ecx) +; 
X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %ebp, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ebp, %edi ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx @@ -557,12 +555,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull %esi, %edi +; X86-NEXT: imull %ebp, %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: imull %edx, %esi @@ -586,7 +584,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $152, %esp +; X86-NEXT: addl $156, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/pr64589.ll b/llvm/test/CodeGen/X86/pr64589.ll index 130ef517ae28e..d93d54f4c31d0 100644 --- a/llvm/test/CodeGen/X86/pr64589.ll +++ b/llvm/test/CodeGen/X86/pr64589.ll @@ -7,8 +7,8 @@ define i8 @test(ptr %p) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: orb 1(%rdi), %al +; CHECK-NEXT: movzbl 1(%rdi), %eax +; CHECK-NEXT: orb (%rdi), %al ; CHECK-NEXT: setne %al ; CHECK-NEXT: addb %al, %al ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll index ec7dca4285a35..30202701fdb8c 100644 --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -1550,26 +1550,40 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; ; SSE42-LABEL: select_v2i8: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pxor %xmm0, %xmm1 -; SSE42-NEXT: ptest %xmm1, %xmm1 +; SSE42-NEXT: movzwl (%rdi), %eax +; SSE42-NEXT: movd %eax, %xmm0 +; SSE42-NEXT: movzwl (%rsi), %eax +; SSE42-NEXT: movd %eax, %xmm1 +; SSE42-NEXT: 
pcmpeqb %xmm0, %xmm1 +; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 +; SSE42-NEXT: movmskpd %xmm0, %eax +; SSE42-NEXT: cmpl $3, %eax ; SSE42-NEXT: sete %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vptest %xmm0, %xmm0 -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: movzwl (%rdi), %eax +; AVX1OR2-NEXT: vmovd %eax, %xmm0 +; AVX1OR2-NEXT: movzwl (%rsi), %eax +; AVX1OR2-NEXT: vmovd %eax, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0 +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: ; AVX512: # %bb.0: ; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: cmpw (%rsi), %ax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb $3, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %v0 = load <2 x i8>, ptr %s0, align 1 diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll index 951bcfa8fc1b7..2df39d69dbb75 100644 --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -1433,19 +1433,25 @@ define i1 @select_v2i8(ptr %s0, ptr %s1) { ; ; SSE42-LABEL: select_v2i8: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: movmskpd %xmm1, %eax +; SSE42-NEXT: movzwl (%rdi), %eax +; SSE42-NEXT: movd %eax, %xmm0 +; SSE42-NEXT: movzwl (%rsi), %eax +; SSE42-NEXT: movd %eax, %xmm1 +; SSE42-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 +; SSE42-NEXT: movmskpd %xmm0, %eax ; SSE42-NEXT: testl %eax, %eax ; SSE42-NEXT: setne %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: movzwl (%rdi), %eax +; AVX1OR2-NEXT: vmovd %eax, %xmm0 +; AVX1OR2-NEXT: movzwl (%rsi), %eax +; AVX1OR2-NEXT: vmovd %eax, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq
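A note on the two vector-compare diffs above: both select_v2i8 tests load a
pair of <2 x i8> values, compare them for equality, and reduce the lanes to a
single i1 (any-of in vector-compare-any_of.ll, all-of in
vector-compare-all_of.ll). The IR below is a hedged sketch of the any-of
shape only; the function name and the choice of reduce intrinsic are
assumptions, since the tests' full IR bodies are not reproduced in this diff.

  ; Sketch, not the verbatim test body.
  define i1 @select_v2i8_anyof_sketch(ptr %s0, ptr %s1) {
    %v0 = load <2 x i8>, ptr %s0, align 1    ; matches the tests' loads
    %v1 = load <2 x i8>, ptr %s1, align 1
    %c = icmp eq <2 x i8> %v0, %v1           ; lowers to the pcmpeqb seen above
    %r = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %c)  ; true if any lane equal
    ret i1 %r
  }
  declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)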