diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index fb4da2c11cda7..49ce4b660c3ae 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -110,6 +110,7 @@ class CopyTracker {
   struct CopyInfo {
     MachineInstr *MI = nullptr;
     MachineInstr *LastSeenUseInCopy = nullptr;
+    SmallPtrSet<MachineInstr *, 4> SrcUsers;
     SmallVector<MCRegister, 4> DefRegs;
     bool Avail = false;
   };
@@ -224,6 +225,43 @@ class CopyTracker {
     }
   }
 
+  /// Track copy's src users, and return false if that can't be done.
+  /// We can only track the users if we have a COPY instruction whose
+  /// source is the same as \p Reg.
+  bool trackSrcUsers(MCRegister Reg, MachineInstr &MI,
+                     const TargetRegisterInfo &TRI, const TargetInstrInfo &TII,
+                     bool UseCopyInstr) {
+    MCRegUnit RU = *TRI.regunits(Reg).begin();
+    MachineInstr *AvailCopy = findCopyDefViaUnit(RU, TRI);
+    if (!AvailCopy)
+      return false;
+
+    std::optional<DestSourcePair> CopyOperands =
+        isCopyInstr(*AvailCopy, TII, UseCopyInstr);
+    Register Src = CopyOperands->Source->getReg();
+
+    // Bail out if the source of the copy is not the same as Reg.
+    if (Src != Reg)
+      return false;
+
+    auto I = Copies.find(RU);
+    if (I == Copies.end())
+      return false;
+
+    I->second.SrcUsers.insert(&MI);
+    return true;
+  }
+
+  /// Return the users for a given register.
+  SmallPtrSet<MachineInstr *, 4> getSrcUsers(MCRegister Reg,
+                                             const TargetRegisterInfo &TRI) {
+    MCRegUnit RU = *TRI.regunits(Reg).begin();
+    auto I = Copies.find(RU);
+    if (I == Copies.end())
+      return {};
+    return I->second.SrcUsers;
+  }
+
   /// Add this copy's registers into the tracker's copy maps.
   void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI,
                  const TargetInstrInfo &TII, bool UseCopyInstr) {
@@ -236,7 +274,7 @@ class CopyTracker {
 
     // Remember Def is defined by the copy.
     for (MCRegUnit Unit : TRI.regunits(Def))
-      Copies[Unit] = {MI, nullptr, {}, true};
+      Copies[Unit] = {MI, nullptr, {}, {}, true};
 
     // Remember source that's copied to Def. Once it's clobbered, then
     // it's no longer available for copy propagation.
@@ -427,6 +465,8 @@ class MachineCopyPropagation : public MachineFunctionPass {
   bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
   bool hasOverlappingMultipleDef(const MachineInstr &MI,
                                  const MachineOperand &MODef, Register Def);
+  bool canUpdateSrcUsers(const MachineInstr &Copy,
+                         const MachineOperand &CopySrc);
 
   /// Candidates for deletion.
   SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
@@ -667,6 +707,27 @@ bool MachineCopyPropagation::hasOverlappingMultipleDef(
   return false;
 }
 
+/// Return true if it is safe to update all users of the \p CopySrc register
+/// in the given \p Copy instruction.
+bool MachineCopyPropagation::canUpdateSrcUsers(const MachineInstr &Copy,
+                                               const MachineOperand &CopySrc) {
+  assert(CopySrc.isReg() && "Expected a register operand");
+  for (auto *SrcUser : Tracker.getSrcUsers(CopySrc.getReg(), *TRI)) {
+    if (hasImplicitOverlap(*SrcUser, CopySrc))
+      return false;
+
+    for (MachineOperand &MO : SrcUser->uses()) {
+      if (!MO.isReg() || !MO.isUse() || MO.getReg() != CopySrc.getReg())
+        continue;
+      if (MO.isTied() || !MO.isRenamable() ||
+          !isBackwardPropagatableRegClassCopy(Copy, *SrcUser,
+                                              MO.getOperandNo()))
+        return false;
+    }
+  }
+  return true;
+}
+
 /// Look for available copies whose destination register is used by \p MI and
 /// replace the use in \p MI with the copy's source register.
 void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
@@ -1033,6 +1094,9 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
     if (hasOverlappingMultipleDef(MI, MODef, Def))
       continue;
 
+    if (!canUpdateSrcUsers(*Copy, *CopyOperands->Source))
+      continue;
+
     LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
                       << "\n     with " << printReg(Def, TRI) << "\n     in "
                       << MI << "     from " << *Copy);
@@ -1040,6 +1104,15 @@
     MODef.setReg(Def);
     MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
 
+    for (auto *SrcUser : Tracker.getSrcUsers(Src, *TRI)) {
+      for (MachineOperand &MO : SrcUser->uses()) {
+        if (!MO.isReg() || !MO.isUse() || MO.getReg() != Src)
+          continue;
+        MO.setReg(Def);
+        MO.setIsRenamable(CopyOperands->Destination->isRenamable());
+      }
+    }
+
     LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
     MaybeDeadCopies.insert(Copy);
     Changed = true;
@@ -1105,7 +1178,9 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
               CopyDbgUsers[Copy].insert(&MI);
             }
          }
-        } else {
+        } else if (!Tracker.trackSrcUsers(MO.getReg().asMCReg(), MI, *TRI, *TII,
+                                          UseCopyInstr)) {
+          // If we can't track the source users, invalidate the register.
           Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
                                      UseCopyInstr);
         }
diff --git a/llvm/test/CodeGen/AArch64/machine-cp-backward-uses.mir b/llvm/test/CodeGen/AArch64/machine-cp-backward-uses.mir
new file mode 100644
index 0000000000000..e186799c81ce0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-cp-backward-uses.mir
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s
+
+# Normal case
+---
+name: test1
+body: |
+  bb.0:
+    liveins: $w2
+    ; CHECK-LABEL: name: test1
+    ; CHECK: liveins: $w2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: renamable $w0 = MOVi32imm 5
+    ; CHECK-NEXT: renamable $w3 = ADDWrr renamable $w0, killed renamable $w2
+    ; CHECK-NEXT: RET_ReallyLR implicit killed $w0
+    renamable $w1 = MOVi32imm 5
+    renamable $w3 = ADDWrr renamable $w1, killed renamable $w2
+    renamable $w0 = COPY killed renamable $w1
+    RET_ReallyLR implicit killed $w0
+...
+
+# Not renamable use
+---
+name: test2
+body: |
+  bb.0:
+    liveins: $w2
+    ; CHECK-LABEL: name: test2
+    ; CHECK: liveins: $w2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: renamable $w1 = MOVi32imm 5
+    ; CHECK-NEXT: renamable $w3 = ADDWrr $w1, killed renamable $w2
+    ; CHECK-NEXT: renamable $w0 = COPY killed renamable $w1
+    ; CHECK-NEXT: RET_ReallyLR implicit killed $w0
+    renamable $w1 = MOVi32imm 5
+    renamable $w3 = ADDWrr $w1, killed renamable $w2
+    renamable $w0 = COPY killed renamable $w1
+    RET_ReallyLR implicit killed $w0
+...
+
+# Implicit use
+---
+name: test3
+body: |
+  bb.0:
+    liveins: $w2
+    ; CHECK-LABEL: name: test3
+    ; CHECK: liveins: $w2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: renamable $w1 = MOVi32imm 5
+    ; CHECK-NEXT: renamable $w3 = ADDWrr renamable $w1, killed renamable $w2, implicit $w1
+    ; CHECK-NEXT: renamable $w0 = COPY killed renamable $w1
+    ; CHECK-NEXT: RET_ReallyLR implicit killed $w0
+    renamable $w1 = MOVi32imm 5
+    renamable $w3 = ADDWrr renamable $w1, killed renamable $w2, implicit $w1
+    renamable $w0 = COPY killed renamable $w1
+    RET_ReallyLR implicit killed $w0
+...
diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll index afd75940b4593..464808ec8861b 100644 --- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -7,12 +7,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV6: @ %bb.0: @ %start ; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV6-NEXT: sub sp, sp, #28 -; ARMV6-NEXT: ldr r7, [sp, #72] +; ARMV6-NEXT: ldr lr, [sp, #72] ; ARMV6-NEXT: mov r6, r0 ; ARMV6-NEXT: str r0, [sp, #8] @ 4-byte Spill ; ARMV6-NEXT: ldr r4, [sp, #84] -; ARMV6-NEXT: umull r1, r0, r2, r7 -; ARMV6-NEXT: mov lr, r7 +; ARMV6-NEXT: umull r1, r0, r2, lr ; ARMV6-NEXT: umull r5, r10, r4, r2 ; ARMV6-NEXT: str r1, [r6] ; ARMV6-NEXT: ldr r6, [sp, #80] diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll index 8d548861f4393..72cead18f89fa 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll @@ -388,9 +388,8 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) { ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__divdi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__divdi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -405,9 +404,8 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) { ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__divdi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__divdi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -549,65 +547,59 @@ define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__divti3)($16) +; MMR3-NEXT: lw $25, %call16(__divti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc 
$ra ; ; MMR6-LABEL: sdiv_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__divti3)($16) +; MMR6-NEXT: lw $25, %call16(__divti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = sdiv i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll index 29cb34b8d970f..72496fcc53a5a 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll @@ -336,9 +336,8 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) { ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__moddi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__moddi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -353,9 +352,8 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) { ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__moddi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__moddi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -497,65 +495,59 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; 
MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__modti3)($16) +; MMR3-NEXT: lw $25, %call16(__modti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: srem_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__modti3)($16) +; MMR6-NEXT: lw $25, %call16(__modti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = srem i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll index cc2c6614e69c8..9451f1e9be096 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll @@ -336,9 +336,8 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) { ; MMR3-NEXT: .cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__udivdi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__udivdi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -353,9 +352,8 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) { ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__udivdi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__udivdi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 
20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -497,65 +495,59 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__udivti3)($16) +; MMR3-NEXT: lw $25, %call16(__udivti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: udiv_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__udivti3)($16) +; MMR6-NEXT: lw $25, %call16(__udivti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = udiv i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll index 5da1f614b8f15..7bd8df9e48c75 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll @@ -428,9 +428,8 @@ define signext i64 @urem_i64(i64 signext %a, i64 signext %b) { ; MMR3-NEXT: 
.cfi_def_cfa_offset 24 ; MMR3-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 -; MMR3-NEXT: addu $2, $2, $25 -; MMR3-NEXT: lw $25, %call16(__umoddi3)($2) -; MMR3-NEXT: move $gp, $2 +; MMR3-NEXT: addu $gp, $2, $25 +; MMR3-NEXT: lw $25, %call16(__umoddi3)($gp) ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop ; MMR3-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -445,9 +444,8 @@ define signext i64 @urem_i64(i64 signext %a, i64 signext %b) { ; MMR6-NEXT: .cfi_def_cfa_offset 24 ; MMR6-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 -; MMR6-NEXT: addu $2, $2, $25 -; MMR6-NEXT: lw $25, %call16(__umoddi3)($2) -; MMR6-NEXT: move $gp, $2 +; MMR6-NEXT: addu $gp, $2, $25 +; MMR6-NEXT: lw $25, %call16(__umoddi3)($gp) ; MMR6-NEXT: jalr $25 ; MMR6-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 24 @@ -589,65 +587,59 @@ define signext i128 @urem_i128(i128 signext %a, i128 signext %b) { ; MMR3: # %bb.0: # %entry ; MMR3-NEXT: lui $2, %hi(_gp_disp) ; MMR3-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR3-NEXT: addiusp -48 -; MMR3-NEXT: .cfi_def_cfa_offset 48 -; MMR3-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR3-NEXT: swp $16, 36($sp) +; MMR3-NEXT: addiusp -40 +; MMR3-NEXT: .cfi_def_cfa_offset 40 +; MMR3-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR3-NEXT: .cfi_offset 31, -4 ; MMR3-NEXT: .cfi_offset 17, -8 -; MMR3-NEXT: .cfi_offset 16, -12 -; MMR3-NEXT: addu $16, $2, $25 +; MMR3-NEXT: addu $gp, $2, $25 ; MMR3-NEXT: move $1, $7 -; MMR3-NEXT: lw $7, 68($sp) -; MMR3-NEXT: lw $17, 72($sp) -; MMR3-NEXT: lw $3, 76($sp) +; MMR3-NEXT: lw $7, 60($sp) +; MMR3-NEXT: lw $17, 64($sp) +; MMR3-NEXT: lw $3, 68($sp) ; MMR3-NEXT: move $2, $sp ; MMR3-NEXT: sw16 $3, 28($2) ; MMR3-NEXT: sw16 $17, 24($2) ; MMR3-NEXT: sw16 $7, 20($2) -; MMR3-NEXT: lw $3, 64($sp) +; MMR3-NEXT: lw $3, 56($sp) ; MMR3-NEXT: sw16 $3, 16($2) -; MMR3-NEXT: lw $25, %call16(__umodti3)($16) +; MMR3-NEXT: lw $25, %call16(__umodti3)($gp) ; MMR3-NEXT: move $7, $1 -; MMR3-NEXT: move $gp, $16 ; MMR3-NEXT: jalr $25 ; MMR3-NEXT: nop -; MMR3-NEXT: lwp $16, 36($sp) -; MMR3-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR3-NEXT: addiusp 48 +; MMR3-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra ; ; MMR6-LABEL: urem_i128: ; MMR6: # %bb.0: # %entry ; MMR6-NEXT: lui $2, %hi(_gp_disp) ; MMR6-NEXT: addiu $2, $2, %lo(_gp_disp) -; MMR6-NEXT: addiu $sp, $sp, -48 -; MMR6-NEXT: .cfi_def_cfa_offset 48 -; MMR6-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $17, 40($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -40 +; MMR6-NEXT: .cfi_def_cfa_offset 40 +; MMR6-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $17, 32($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 31, -4 ; MMR6-NEXT: .cfi_offset 17, -8 -; MMR6-NEXT: .cfi_offset 16, -12 -; MMR6-NEXT: addu $16, $2, $25 +; MMR6-NEXT: addu $gp, $2, $25 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: lw $7, 68($sp) -; MMR6-NEXT: lw $17, 72($sp) -; MMR6-NEXT: lw $3, 76($sp) +; MMR6-NEXT: lw $7, 60($sp) +; MMR6-NEXT: lw $17, 64($sp) +; MMR6-NEXT: lw $3, 68($sp) ; MMR6-NEXT: move $2, $sp ; MMR6-NEXT: sw16 $3, 28($2) ; MMR6-NEXT: sw16 $17, 24($2) ; MMR6-NEXT: sw16 $7, 20($2) -; MMR6-NEXT: lw $3, 64($sp) +; MMR6-NEXT: lw $3, 56($sp) ; MMR6-NEXT: sw16 $3, 16($2) -; MMR6-NEXT: lw $25, %call16(__umodti3)($16) +; MMR6-NEXT: lw $25, 
%call16(__umodti3)($gp) ; MMR6-NEXT: move $7, $1 -; MMR6-NEXT: move $gp, $16 ; MMR6-NEXT: jalr $25 -; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 40($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 48 +; MMR6-NEXT: lw $17, 32($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra entry: %r = urem i128 %a, %b diff --git a/llvm/test/CodeGen/Mips/mcount.ll b/llvm/test/CodeGen/Mips/mcount.ll index ee0409a021c87..8a129536d9769 100644 --- a/llvm/test/CodeGen/Mips/mcount.ll +++ b/llvm/test/CodeGen/Mips/mcount.ll @@ -104,9 +104,8 @@ define void @foo() { ; MIPS32-MM-PIC-NEXT: .cfi_def_cfa_offset 24 ; MIPS32-MM-PIC-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-MM-PIC-NEXT: .cfi_offset 31, -4 -; MIPS32-MM-PIC-NEXT: addu $2, $2, $25 -; MIPS32-MM-PIC-NEXT: lw $25, %call16(_mcount)($2) -; MIPS32-MM-PIC-NEXT: move $gp, $2 +; MIPS32-MM-PIC-NEXT: addu $gp, $2, $25 +; MIPS32-MM-PIC-NEXT: lw $25, %call16(_mcount)($gp) ; MIPS32-MM-PIC-NEXT: move $1, $ra ; MIPS32-MM-PIC-NEXT: .reloc ($tmp0), R_MICROMIPS_JALR, _mcount ; MIPS32-MM-PIC-NEXT: $tmp0: diff --git a/llvm/test/CodeGen/Mips/micromips-gp-rc.ll b/llvm/test/CodeGen/Mips/micromips-gp-rc.ll index 42f65463cf1cd..95f557770a10d 100644 --- a/llvm/test/CodeGen/Mips/micromips-gp-rc.ll +++ b/llvm/test/CodeGen/Mips/micromips-gp-rc.ll @@ -14,5 +14,6 @@ entry: ; Function Attrs: noreturn declare void @exit(i32 signext) -; CHECK: move $gp, ${{[0-9]+}} - +; CHECK: addu $gp, ${{[0-9]+}}, ${{[0-9]+}} +; CHECK: lw ${{[0-9]+}}, %got(g)($gp) +; CHECK: lw ${{[0-9]+}}, %call16(exit)($gp) diff --git a/llvm/test/CodeGen/Mips/tailcall/tailcall.ll b/llvm/test/CodeGen/Mips/tailcall/tailcall.ll index 3b200780b9f59..811fe0f68e750 100644 --- a/llvm/test/CodeGen/Mips/tailcall/tailcall.ll +++ b/llvm/test/CodeGen/Mips/tailcall/tailcall.ll @@ -25,7 +25,7 @@ ; RUN: -mips-tail-calls=1 < %s | FileCheck %s -check-prefix=STATIC64 ; RUN: llc -mtriple=mipsel -relocation-model=pic -mcpu=mips32r6 -mattr=+micromips -verify-machineinstrs \ -; RUN: -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,PIC32MM +; RUN: -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,PIC32MMR6 ; RUN: llc -mtriple=mipsel -relocation-model=static -mcpu=mips32r6 -verify-machineinstrs \ ; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32MMR6 @@ -44,7 +44,8 @@ define i32 @caller1(i32 %a0) nounwind { entry: ; ALL-LABEL: caller1: ; PIC32: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalrs16 $25 +; PIC32MMR6: jalr $25 ; PIC32R6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc @@ -63,6 +64,7 @@ entry: ; ALL-LABEL: caller2 ; PIC32: jalr $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; PIC32R6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc @@ -82,6 +84,7 @@ entry: ; PIC32: jalr $25 ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; N64: jalr $25 @@ -100,6 +103,7 @@ entry: ; PIC32: jalr $25 ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; SATATIC32MMR6: jal ; PIC64: jalr $25 @@ -119,6 +123,7 @@ entry: ; PIC32: jr $25 ; PIC32R6: jr $25 ; PIC32MM: jr +; PIC32MMR6: jr ; STATIC32: j ; STATIC32MMR6: bc ; PIC64: jr $25 @@ -161,6 +166,7 @@ entry: ; PIC32: jr $25 ; PIC32R6: jrc $25 ; PIC32MM: jrc +; PIC32MMR6: jrc ; STATIC32: j ; STATIC32MMR6: bc ; PIC64: jr $25 @@ -177,7 +183,8 @@ entry: ; ALL-LABEL: caller8_1: ; PIC32: jalr $25 ; 
PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalrs16 $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; PIC64: jalr $25 @@ -200,6 +207,7 @@ entry: ; PIC32: jr $25 ; PIC32R6: jrc $25 ; PIC32MM: jrc +; PIC32MMR6: jrc ; STATIC32: j ; STATIC32MMR6: bc ; PIC64: jr $25 @@ -216,6 +224,7 @@ entry: ; PIC32: jalr $25 ; PIC32R6: jalrc $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; STATIC64: jal @@ -235,6 +244,7 @@ entry: ; PIC32: jalr $25 ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; STATIC64: jal @@ -254,6 +264,7 @@ entry: ; PIC32: jalr $25 ; PIC32R6: jalrc $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; STATIC64: jal @@ -275,6 +286,7 @@ entry: ; PIC32: jalr $25 ; PIC32R6: jalrc $25 ; PIC32MM: jalr $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; STATIC64: jal @@ -294,7 +306,8 @@ entry: ; ALL-LABEL: caller13: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalrs16 $25 +; PIC32MMR6: jalr $25 ; STATIC32: jal ; STATIC32MMR6: balc ; STATIC64: jal diff --git a/llvm/test/CodeGen/Mips/tls.ll b/llvm/test/CodeGen/Mips/tls.ll index 3ed6f6bd79ca3..76d54414a518f 100644 --- a/llvm/test/CodeGen/Mips/tls.ll +++ b/llvm/test/CodeGen/Mips/tls.ll @@ -28,10 +28,9 @@ entry: ; PIC64-DAG: lw $2, 0($2) ; MM-LABEL: f1: -; MM-DAG: addu $[[R0:[a-z0-9]+]], $2, $25 -; MM-DAG: addiu $4, $[[R0]], %tlsgd(t1) -; MM-DAG: lw $25, %call16(__tls_get_addr)($[[R0]]) -; MM-DAG: move $gp, $2 +; MM-DAG: addu $gp, $2, $25 +; MM-DAG: addiu $4, $gp, %tlsgd(t1) +; MM-DAG: lw $25, %call16(__tls_get_addr)($gp) ; MM-DAG: jalr $25 ; MM-DAG: lw16 $2, 0($2) } diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index ffaa9f6297ed8..5263e0d4f6f39 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1778,13 +1778,12 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1915,13 +1914,12 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll index eaa1293ed2f98..536a1ae3b918d 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -1187,13 +1187,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; CHECK-NEXT: cmovbq %rcx, %rdx -; CHECK-NEXT: movq %rcx, %r14 +; CHECK-NEXT: movabsq $-9223372036854775808, %rbp # imm = 0x8000000000000000 +; CHECK-NEXT: cmovbq %rbp, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rcx, %rdx -; CHECK-NEXT: movq %rcx, %r15 +; CHECK-NEXT: movabsq $9223372036854775807, %r15 # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovaq %r15, %rdx ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: movq $-1, %r13 @@ -1211,8 +1209,7 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rdx -; CHECK-NEXT: movq %r14, %rbp +; CHECK-NEXT: cmovbq %rbp, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx ; CHECK-NEXT: cmovaq %r13, %rax diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index ef85cd146d65f..5bce0bb5a60dc 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -1713,8 +1713,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: mulps %xmm10, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: mulps %xmm13, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 @@ -1776,8 +1776,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: mulps %xmm10, %xmm1 ; SSE-NEXT: mulps %xmm13, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 @@ -4236,8 +4235,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: vmovapd %ymm2, %ymm12 ; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vmovapd 144(%rbp), %ymm2 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm13 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm13 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm14 ; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm8 ; AVX1-NEXT: vmovapd %ymm1, %ymm9 @@ -4245,7 +4244,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm1 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 288(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 @@ -4263,14 +4262,12 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; 
AVX1-NEXT: vmovapd %ymm13, %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 ; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm2, %ymm13 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 @@ -4289,7 +4286,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload ; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 352(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 @@ -4637,8 +4634,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: vmovapd %ymm2, %ymm12 ; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: vmovapd 144(%rbp), %ymm2 -; AVX2-NEXT: vmovapd 112(%rbp), %ymm13 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm13 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm14 ; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm8 ; AVX2-NEXT: vmovapd %ymm1, %ymm9 @@ -4646,7 +4643,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm1 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 288(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 @@ -4664,14 +4661,12 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX2-NEXT: vmovapd %ymm13, %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 ; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm2, %ymm13 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 @@ -4690,7 +4685,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload ; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 352(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index 5ffee3fa6bda4..e6e77f4e4eba7 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -362,11 +362,10 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %eax, %ebp -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %ecx, %esi ; X86-NEXT: movl $0, %ebx ; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl %ebx, %edi diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll index 0299773aa67ad..bb93e34fda7c4 100644 --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -10,14 +10,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $400, %esp # imm = 0x190 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 60(%eax), %ebp ; X86-NEXT: movl 56(%eax), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl (%ebx), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1552,10 +1551,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 56(%eax), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 56(%esi), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx @@ -4982,16 +4980,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rbx ; X64-NEXT: adcq %rsi, %r15 ; X64-NEXT: setb %sil ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rbp, %r13 +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r15, %rdi @@ -5121,10 +5118,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %r11 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 40(%r8), %rsi +; X64-NEXT: movq 40(%r8), %rbx ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r11, %rsi @@ -5259,29 +5255,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: mulq 
%r14 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r8, %rbx ; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %r8b ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r10, %rsi @@ -5403,23 +5397,21 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 64(%r9), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 64(%r13), %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rsi, %r8 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 72(%r9), %rsi -; X64-NEXT: movq %r9, %r13 +; X64-NEXT: movq 72(%r13), %rsi ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r10 @@ -5650,35 +5642,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq 80(%rdi), %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq 80(%r14), %r10 ; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%rdi), %r15 -; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq 88(%r14), %r15 ; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, %rbx +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r9, %rdi ; X64-NEXT: adcq %r8, %rcx ; X64-NEXT: setb %r8b ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %r11, %r10 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index 64f6746e616ed..2421aabdbcd99 100644 --- 
a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -672,10 +672,9 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 40(%eax), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 40(%edi), %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx @@ -1324,18 +1323,17 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq 32(%rdi), %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 32(%r8), %r15 ; X64-NEXT: imulq %r15, %rsi ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq 40(%rdi), %rsi +; X64-NEXT: movq 40(%r8), %rsi ; X64-NEXT: imulq %rsi, %r10 ; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq 48(%rdi), %rax -; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: movq 48(%r8), %rax ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: imulq %r9, %rdi diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll index 56618205ec7c1..c6aeb72f4d974 100644 --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -37,10 +37,9 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5, ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm8 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovss {{.*#+}} xmm15 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss %xmm0, %xmm15, %xmm1 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmovaps %xmm5, %xmm15 ; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index ce0b212aa4c26..4925f8bc6c8b0 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -334,11 +334,10 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ebx diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll index e61ba4923f792..f2627df3a98d8 100644 --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -171,9 +171,8 @@ define i64 
@t6(i64 %key, ptr nocapture %val) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrdl $3, %eax, %ecx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shrdl $3, %esi, %ecx ; X86-NEXT: shrl $3, %esi ; X86-NEXT: movl (%edx), %eax ; X86-NEXT: movl 4(%edx), %edx diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index b2b5bcc5b44b2..816633b5b18ab 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -101,11 +101,10 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebx, %esi diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index af2f66d1e9bd0..384f8b832afb9 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -549,11 +549,10 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { define <16 x float> @v16f32_estimate(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_estimate: ; SSE: # %bb.0: -; SSE-NEXT: rsqrtps %xmm0, %xmm5 +; SSE-NEXT: rsqrtps %xmm0, %xmm6 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: mulps %xmm4, %xmm6 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; SSE-NEXT: addps %xmm5, %xmm0 diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 82603b35ba712..4c3170304b980 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -47,10 +47,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %esi, %eax diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 50cbb14d52427..22b5246443fa8 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -107,9 +107,8 @@ define <2 x i32> @smulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmuldq %xmm1, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE41-NEXT: movq %xmm2, (%rdi) ; SSE41-NEXT: psrad $31, 
%xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 @@ -441,9 +440,8 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm7, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: pxor %xmm6, %xmm6 @@ -516,9 +514,8 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm7, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: pxor %xmm6, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 1b30b0814330d..0cefc1c32d71b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -1238,7 +1238,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm13 ; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm10 ; SSE-NEXT: movdqa 32(%rdi), %xmm7 @@ -1256,9 +1256,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] @@ -1290,7 +1289,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 112(%rdi), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] @@ -1299,14 +1298,13 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: 
pandn %xmm9, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa 128(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] @@ -2302,11 +2300,10 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm3 @@ -2323,11 +2320,10 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 352(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm3 @@ -2344,11 +2340,10 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm3 @@ -2725,12 +2720,11 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 368(%rdi), %xmm14 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX-NEXT: vmovdqa %xmm0, %xmm14 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm9 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,3,2,3,4,5,6,7] @@ -2783,11 +2777,10 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm15 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] -; AVX-NEXT: vmovdqa %xmm0, %xmm15 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm4[1],xmm15[2,3],xmm4[4],xmm15[5,6],xmm4[7] +; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,2,1] @@ -3446,9 +3439,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5 ; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6 @@ -3464,11 +3456,9 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX512-NEXT: vmovdqa 288(%rdi), %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] @@ -3613,9 +3603,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 ; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 @@ -3631,11 +3620,9 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] @@ -3780,9 +3767,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5 ; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6 @@ -3798,11 +3784,9 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] @@ -3947,9 +3931,8 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 ; AVX512DQ-FCP-NEXT: 
vmovdqa 272(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 @@ -3965,11 +3948,9 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index df28ac14a30c0..ae4f85ce42a19 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -1918,10 +1918,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7] ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpackusdw %xmm7, %xmm15, %xmm7 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] -; AVX-NEXT: vmovdqa %xmm8, %xmm9 -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm9[0],xmm6[1,2,3],xmm9[4],xmm6[5,6,7] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] @@ -2610,10 +2609,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm11 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill @@ -2636,10 +2634,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] @@ -3876,9 +3873,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX-NEXT: vmovdqa %xmm0, %xmm3 +; AVX-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm10[1,2,3],xmm3[4],xmm10[5,6,7] ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 0b44571acb6b2..68e92d7cf773f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1309,12 +1309,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] @@ -2408,7 +2407,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm13 ; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 @@ -2416,14 +2415,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: movdqa 208(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm8 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] @@ -2453,9 +2451,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -2474,10 +2471,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa 240(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] @@ -2899,22 +2895,18 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-LABEL: load_i16_stride5_vf32: ; AVX: # %bb.0: ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovdqa %xmm2, %xmm9 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm1, %xmm7 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; AVX-NEXT: vmovdqa %xmm3, %xmm11 +; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX-NEXT: vmovdqa %xmm3, %xmm10 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; AVX-NEXT: 
vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] @@ -2923,16 +2915,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX-NEXT: vmovdqa (%rdi), %xmm5 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; AVX-NEXT: vmovdqa %xmm4, %xmm15 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] -; AVX-NEXT: vmovdqa %xmm6, %xmm12 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] @@ -2940,10 +2930,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] ; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps %xmm0, %xmm5 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -2979,10 +2968,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa %xmm2, %xmm14 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 224(%rdi), %xmm1 @@ -3233,7 +3221,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 @@ -3251,8 +3239,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-NEXT: vpmovsxbw 
{{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15] -; AVX2-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4],ymm3[5],ymm15[6,7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12],ymm3[13],ymm15[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7] ; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0 @@ -3454,19 +3441,17 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] @@ -3489,8 +3474,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15] +; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm5 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm14 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] @@ -3673,22 +3658,18 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] @@ -3885,7 +3866,6 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] @@ -3897,9 +3877,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] ; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,1,2,3] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512-NEXT: vmovdqa64 176(%rdi), %xmm20 +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] @@ -3933,14 +3912,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] ; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm15 +; AVX512-NEXT: vpsrlq $48, %xmm20, %xmm15 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] @@ -4257,7 +4235,6 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] @@ -4269,9 +4246,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-NEXT: vmovdqa64 176(%rdi), %xmm20 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] @@ -4305,14 +4281,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpsrlq $48, %xmm1, %xmm15 +; AVX512DQ-NEXT: vpsrlq $48, %xmm20, %xmm15 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] @@ -4954,10 +4929,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] @@ -5971,10 +5945,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps %xmm0, %xmm11 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 544(%rdi), %xmm11 +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,1,0,1] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -6562,7 +6535,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14 -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm8 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 @@ -6586,9 +6559,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX2-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] @@ -6608,10 +6580,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX2-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5],ymm4[6],ymm15[7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13],ymm4[14],ymm15[15] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 @@ -6621,10 +6592,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] -; AVX2-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm12[1,2],ymm9[3],ymm12[4],ymm9[5],ymm12[6,7],ymm9[8],ymm12[9,10],ymm9[11],ymm12[12],ymm9[13],ymm12[14,15] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] @@ -6694,23 +6664,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 624(%rdi), %xmm5 +; AVX2-NEXT: vmovdqa 624(%rdi), %xmm15 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm12 -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX2-NEXT: vmovdqa %xmm5, %xmm15 -; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm15[1],xmm12[2,3] +; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm10 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX2-NEXT: vmovdqa 448(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX2-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-NEXT: vmovdqa 464(%rdi), %xmm10 +; AVX2-NEXT: vmovdqa 448(%rdi), %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm10[1],xmm8[2,3] +; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm13 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -7054,7 +7021,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: 
vmovdqa 160(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 @@ -7064,9 +7031,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] @@ -7092,13 +7058,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 @@ -7171,9 +7135,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -7181,34 +7144,30 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm10 +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm10[1],xmm8[2,3] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm11[1],xmm7[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm11 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -7261,9 +7220,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -7533,7 +7491,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7564,18 +7522,16 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4],ymm14[5],ymm11[6,7],ymm14[8],ymm11[9,10],ymm14[11],ymm11[12],ymm14[13],ymm11[14,15] ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15] -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5],ymm15[6],ymm9[7,8],ymm15[9],ymm9[10,11],ymm15[12],ymm9[13],ymm15[14],ymm9[15] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 @@ -7588,20 +7544,18 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm10 
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -7649,38 +7603,33 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm11 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm5, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] @@ -7958,24 +7907,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i16_stride5_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $552, %rsp # imm = 0x228 -; AVX512-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa %ymm1, %ymm6 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] +; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -7987,10 +7932,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 160(%rdi), %xmm14 ; AVX512-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm10 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4],ymm3[5],ymm9[6,7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12],ymm3[13],ymm9[14,15] -; AVX512-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -8038,11 +7982,9 @@ define void 
@load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512-NEXT: vmovdqa %xmm2, %xmm11 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -8105,9 +8047,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm21 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] -; AVX512-NEXT: vmovdqa %xmm1, %xmm7 +; AVX512-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] @@ -8492,12 +8433,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 @@ -8507,15 +8446,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4],ymm11[5],ymm5[6,7],ymm11[8],ymm5[9,10],ymm11[11],ymm5[12],ymm11[13],ymm5[14,15] -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 @@ -8892,24 +8829,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-LABEL: load_i16_stride5_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $552, %rsp # imm = 0x228 -; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm6 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] +; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -8921,10 +8854,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm14 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm10 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4],ymm3[5],ymm9[6,7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12],ymm3[13],ymm9[14,15] -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -8972,11 +8904,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm11 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -9039,9 +8969,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm21 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm7 +; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] @@ -9426,12 +9355,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 @@ -9441,15 +9368,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4],ymm11[5],ymm5[6,7],ymm11[8],ymm5[9,10],ymm11[11],ymm5[12],ymm11[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index f7218adf85667..751412c77a59a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1864,9 +1864,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovdqa %xmm1, %xmm8 +; AVX-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpslld $16, %xmm1, %xmm2 @@ -1879,10 +1878,9 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX-NEXT: vpsrlq $16, %xmm7, %xmm10 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,3] -; AVX-NEXT: vmovdqa %xmm1, %xmm6 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] @@ -3707,15 +3705,14 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vmovdqa %xmm3, %xmm12 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 @@ -3726,8 +3723,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX-NEXT: vmovdqa %xmm4, %xmm10 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -3737,20 +3733,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX-NEXT: vpsrlq $16, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm14 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX-NEXT: vpsrlq $16, %xmm14, %xmm2 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vmovaps %ymm2, %ymm5 +; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0 @@ -4224,10 +4218,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm1 ; AVX2-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7] -; AVX2-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8 @@ -4999,11 +4992,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512: # %bb.0: ; AVX512-NEXT: subq $72, %rsp ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512-NEXT: vmovdqa %ymm1, %ymm11 -; AVX512-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7] ; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm1 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] @@ -6552,13 +6543,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 @@ -9363,13 +9353,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9950,13 +9939,12 @@ define void 
@load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10427,10 +10415,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vextracti32x4 $1, %ymm1, %xmm20 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -10454,9 +10441,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512-NEXT: vpshufb %xmm9, %xmm12, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm22 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] @@ -10685,9 +10671,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm29 & (zmm1 ^ zmm2)) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm2 & (zmm3 ^ zmm1)) -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm22 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = 
zmm3 ^ (zmm22 & (zmm3 ^ zmm1)) ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11928,9 +11913,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm21 & (zmm1 ^ zmm2)) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm2 & (zmm3 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm18 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -12660,9 +12644,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm11 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 26e19566f048c..713bd757a7b99 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -2296,11 +2296,10 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; SSE-NEXT: psrlq $48, %xmm1 ; 
SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm9, %xmm4 @@ -2421,9 +2420,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX-NEXT: vmovdqa %xmm2, %xmm8 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: vmovdqa 208(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2438,9 +2436,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; AVX-NEXT: vmovdqa %xmm1, %xmm7 +; AVX-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7] ; AVX-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3] @@ -2448,17 +2445,15 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] -; AVX-NEXT: vmovdqa %xmm9, %xmm10 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,2,3,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero -; AVX-NEXT: vmovaps %xmm2, %xmm13 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm13[2],zero +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7] ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] @@ -4155,10 +4150,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm9 ; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] ; 
SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 @@ -4973,10 +4967,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] -; AVX-NEXT: vmovdqa %xmm3, %xmm13 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 @@ -5081,9 +5074,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpsrld $16, %xmm13, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm4 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload ; AVX-NEXT: # xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7] @@ -5346,10 +5338,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vmovaps %ymm5, %ymm13 +; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 ; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] @@ -5504,10 +5495,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vmovaps %ymm5, %ymm11 +; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX-NEXT: vandnps %ymm2, %ymm11, %ymm2 +; AVX-NEXT: vandps %ymm4, %ymm11, %ymm4 ; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload @@ -5575,22 +5565,19 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i16_stride7_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-NEXT: vmovdqa 
288(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm11[1],ymm8[2,3,4],ymm11[5],ymm8[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] @@ -5603,12 +5590,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] @@ -5655,15 +5642,13 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %ymm9, %ymm3 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX2-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm14[2,3],ymm9[4,5],ymm14[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] ; AVX2-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] -; AVX2-NEXT: vmovdqa %ymm7, %ymm15 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] @@ -5683,10 +5668,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] -; AVX2-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -5731,7 +5715,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3,4,5],ymm15[6],ymm12[7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0 @@ -5743,7 +5727,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] @@ -5768,7 +5752,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] @@ -5794,9 +5778,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] -; AVX2-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] @@ -5805,8 +5788,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm7[2],mem[3,4],ymm7[5],mem[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] @@ -5878,8 +5861,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 @@ -5996,21 +5978,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] @@ -6023,12 +6002,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] @@ -6072,9 +6051,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] @@ -6101,9 +6079,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 @@ -6208,9 +6185,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -6403,30 +6379,24 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $648, %rsp # imm = 0x288 ; 
AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm14 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -6505,11 +6475,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5] ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm2 @@ -6517,22 +6485,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm12 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = 
[20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,0,2] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -6799,16 +6763,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512-NEXT: vmovdqa %ymm3, %ymm4 -; AVX512-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2],ymm8[3,4,5],ymm4[6],ymm8[7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] @@ -6828,9 +6790,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX512-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] -; AVX512-NEXT: vmovdqa %xmm6, %xmm13 +; AVX512-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512-NEXT: movw $992, %ax # imm = 0x3E0 @@ -7457,17 +7418,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-LABEL: load_i16_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 
32(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] @@ -7487,9 +7445,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] -; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm13 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0 @@ -7502,7 +7459,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -7524,7 +7481,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -7548,8 +7505,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -10469,19 +10425,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm3, %xmm4 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload ; AVX-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX-NEXT: vmovaps %ymm1, %ymm8 +; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX-NEXT: vandps %ymm3, %ymm8, %ymm3 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] @@ -11573,9 +11527,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] -; AVX2-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmovsxbw {{.*#+}} 
xmm6 = [65535,65535,65535,65535,65535,0,0,0] @@ -11627,9 +11580,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 @@ -11638,8 +11591,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] -; AVX2-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] @@ -11735,11 +11687,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -11793,9 +11743,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -12501,7 +12450,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FP-NEXT: vmovdqa 
%ymm9, %ymm6 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -12511,26 +12459,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm7[3],ymm10[4,5],ymm7[6],ymm10[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -12544,7 +12491,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] @@ -12630,8 +12577,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; 
AVX2-FP-NEXT: vpshufb %ymm4, %ymm15, %ymm1 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] @@ -12647,8 +12594,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm5, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm5, %ymm1 +; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] @@ -12666,16 +12613,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm13 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm13 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm12 -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] @@ -12713,7 +12658,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 @@ -12854,9 +12799,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm4 -; AVX2-FP-NEXT: vmovdqa %xmm6, 
%xmm12 +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] @@ -13261,18 +13205,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] @@ -13315,12 +13257,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm0 @@ -13472,9 +13413,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5] ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd 
%ymm4, %ymm5, %ymm4 @@ -13483,9 +13423,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm7 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] @@ -13498,31 +13437,28 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm8 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm0 @@ -13704,18 +13640,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = 
mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm3 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 @@ -13828,9 +13762,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,7,3,6,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,7,3,6,0,0,0] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] @@ -14076,34 +14009,29 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vmovdqa %ymm4, %ymm10 -; AVX512-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512-NEXT: 
vmovdqa64 %xmm4, %xmm22 +; AVX512-NEXT: vmovdqa64 672(%rdi), %xmm22 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm22[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] -; AVX512-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm5[2],ymm8[3,4,5],ymm5[6],ymm8[7] ; AVX512-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 @@ -14120,8 +14048,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] -; AVX512-NEXT: vmovdqa %ymm12, %ymm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -14392,18 +14319,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX512-NEXT: vmovdqa 736(%rdi), %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512-NEXT: vmovdqa %ymm2, %ymm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] @@ -14440,9 +14364,8 @@ 
define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX512-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7] ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] @@ -14973,15 +14896,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermd %zmm26, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm22, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] @@ -15004,10 +14925,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2] ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 @@ -15298,9 +15218,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11 @@ -15314,9 +15234,7 @@ define void 
@load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm15 -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] @@ -15385,10 +15303,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7] -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm11 @@ -15722,11 +15639,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm12 ; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm12 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 @@ -15737,17 +15653,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 @@ -15770,7 +15683,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm21 ; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -16040,18 +15953,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512DQ-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm24 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] @@ -16060,10 +15970,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 832(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 832(%rdi), %ymm9 ; AVX512DQ-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] @@ -16087,15 +15996,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7] +; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7,8,9,10,11,12,13],ymm12[14],ymm10[15] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1 @@ -16114,13 +16022,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm12[6,7] ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm15 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] @@ -16595,15 +16501,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7] @@ -16624,10 +16528,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 @@ -16640,8 +16543,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] @@ -16673,7 +16575,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16915,10 +16817,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3],xmm14[4],xmm5[5],xmm14[6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 @@ -16932,8 +16833,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7] -; AVX512DQ-FCP-NEXT: 
vmovdqa %ymm2, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] @@ -17012,11 +16912,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7 @@ -17074,9 +16972,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm25 & (zmm28 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 1b637cd203c8f..051b4e300b827 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -3928,9 +3928,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX-NEXT: vmovdqa %xmm2, %xmm6 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; AVX-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 320(%rdi), %xmm3 @@ -3943,10 +3942,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX-NEXT: vmovdqa %xmm2, %xmm7 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 448(%rdi), %xmm3 @@ -4025,10 +4023,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX-NEXT: vmovdqa %xmm1, %xmm9 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4136,16 +4133,14 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX-NEXT: vmovdqa %xmm1, %xmm8 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; AVX-NEXT: vmovdqa %xmm4, %xmm7 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] @@ -4370,9 +4365,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-NEXT: vmovdqa %xmm1, %xmm9 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-NEXT: vpbroadcastd %xmm9, %xmm0 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -4434,9 +4428,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4831,9 +4824,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm9 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FP-NEXT: vpbroadcastd %xmm9, %xmm0 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -4895,9 +4887,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5292,9 +5283,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm9 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FCP-NEXT: vpbroadcastd %xmm9, %xmm0 ; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -5356,9 +5346,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5906,9 +5895,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] -; AVX512-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 -; AVX512-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0] +; AVX512-NEXT: vpermt2d %xmm11, %xmm6, %xmm7 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 @@ -6231,10 +6219,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm11 +; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6679,9 +6666,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] -; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0] +; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm6, %xmm7 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; 
AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 @@ -7004,10 +6990,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -7807,7 +7792,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movdqa %xmm9, %xmm7 @@ -7826,9 +7810,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 688(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa 672(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 656(%rdi), %xmm2 @@ -7842,9 +7825,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 592(%rdi), %xmm0 @@ -7858,9 +7840,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 560(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa 544(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 528(%rdi), %xmm2 @@ -7889,9 
+7870,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 432(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm2 @@ -7920,9 +7900,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 944(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 928(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa 928(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 912(%rdi), %xmm2 @@ -7951,9 +7930,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm0 @@ -7983,9 +7961,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 816(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa 800(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 784(%rdi), %xmm0 @@ -8013,11 +7990,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm6 @@ -8030,7 +8006,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa %xmm7, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8809,9 +8785,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; AVX-NEXT: vmovdqa %xmm3, %xmm9 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9009,10 +8984,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX-NEXT: vmovdqa %xmm1, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9236,8 +9210,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -9250,30 +9224,27 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; AVX-NEXT: vmovdqa %xmm4, %xmm8 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX-NEXT: vmovdqa %xmm2, %xmm11 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vmovdqa %xmm4, %xmm15 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -12690,7 +12661,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -12698,7 +12669,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermt2d %xmm0, %xmm4, 
%xmm1 +; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2 @@ -12765,8 +12736,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermt2d %xmm1, %xmm4, %xmm2 -; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vpermt2d %xmm1, %xmm6, %xmm2 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 @@ -13600,11 +13570,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0 -; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 @@ -13832,11 +13801,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 +; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm30 -; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm13 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -13905,9 +13873,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] -; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] +; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm28 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] @@ -14007,10 +13974,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1 ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 @@ -14530,7 +14496,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -14538,7 +14504,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm4, %xmm1 +; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2 @@ -14605,8 +14571,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm4, %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm6, %xmm2 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 @@ -15440,11 +15405,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 @@ -15672,11 +15636,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm30 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm13 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -15745,9 +15708,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] -; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm28 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] @@ -15847,10 +15809,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index afdeebc45ed0a..213c5febfca23 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -728,7 +728,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm12 ; SSE-NEXT: movaps 112(%rdi), %xmm13 ; SSE-NEXT: movaps 144(%rdi), %xmm11 ; SSE-NEXT: movaps 176(%rdi), %xmm10 @@ -759,9 +759,8 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm5 = 
xmm5[0,3],xmm0[0,2] @@ -1203,7 +1202,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 208(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm6 +; SSE-NEXT: movaps 272(%rdi), %xmm10 ; SSE-NEXT: movaps 256(%rdi), %xmm9 ; SSE-NEXT: movaps (%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1212,20 +1211,18 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 32(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 80(%rdi), %xmm12 ; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] @@ -2703,11 +2700,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm11[2,0],ymm2[5,4],ymm11[6,4] -; AVX-NEXT: vmovaps %ymm2, %ymm13 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 640(%rdi), %ymm13 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2961,11 +2957,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps 
{{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1 @@ -3191,11 +3186,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1 @@ -3395,10 +3389,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 @@ -3415,25 +3408,23 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 
192(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 3874581e621b3..61f91b2bb0c0c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -1031,24 +1031,19 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: subq $264, %rsp # imm = 0x108 ; AVX-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX-NEXT: vmovaps %ymm3, %ymm14 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps %ymm1, %ymm15 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX-NEXT: vmovaps %ymm2, %ymm10 -; AVX-NEXT: vmovaps %ymm1, %ymm3 +; AVX-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX-NEXT: vmovaps 224(%rdi), %ymm14 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm10[2,3,0,1] +; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 176(%rdi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0] -; AVX-NEXT: vmovaps %xmm6, %xmm2 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm2[0],xmm1[0] +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 144(%rdi), %xmm1 ; AVX-NEXT: vmovaps 128(%rdi), %xmm6 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1056,9 +1051,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX-NEXT: vmovaps %ymm6, %ymm8 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] ; AVX-NEXT: vmovaps %ymm4, %ymm12 ; AVX-NEXT: vmovups %ymm5, (%rsp) # 32-byte 
Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3,0,1] @@ -1788,14 +1782,12 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm3 ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -2031,11 +2023,10 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] -; AVX-NEXT: vmovaps %ymm5, %ymm10 -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 352(%rdi), %ymm10 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] @@ -2045,12 +2036,11 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX-NEXT: vmovaps 272(%rdi), %xmm9 ; AVX-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX-NEXT: vmovaps %xmm5, %xmm9 -; AVX-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX-NEXT: vmovaps %xmm9, (%rsp) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2076,18 +2066,16 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX-NEXT: vmovaps %ymm2, %ymm13 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3,0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX-NEXT: vmovaps %ymm3, %ymm11 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm11[0],ymm1[1],ymm11[1],ymm1[4],ymm11[4],ymm1[5],ymm11[5] +; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2095,10 +2083,9 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vmovaps %xmm3, %xmm12 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 128(%rdi), %xmm12 +; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -2116,14 +2103,13 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] ; AVX-NEXT: vmovaps (%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX-NEXT: vmovaps 16(%rdi), %xmm7 ; AVX-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm3, %xmm7 +; AVX-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2317,18 +2303,17 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride4_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm12 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm11 -; AVX2-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm9 ; 
AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 272(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2353,9 +2338,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-NEXT: vmovaps %ymm6, %ymm12 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm2 +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2386,8 +2370,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-NEXT: vmovaps %ymm7, %ymm6 +; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2546,18 +2529,17 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride4_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2582,9 +2564,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovaps %ymm6, %ymm12 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm2 +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2615,8 +2596,7 @@ define void @load_i32_stride4_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovaps %ymm7, %ymm6 +; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2775,18 +2755,17 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride4_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2811,9 +2790,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovaps %ymm6, %ymm12 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2844,8 +2822,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovaps %ymm7, %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3717,9 +3694,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 976(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps 960(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -4067,21 +4043,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX-NEXT: vmovaps %ymm5, %ymm8 -; AVX-NEXT: vmovaps %ymm1, %ymm9 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3,0,1] -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] -; AVX-NEXT: vmovaps %ymm5, %ymm7 -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3,0,1] +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 176(%rdi), %xmm5 -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX-NEXT: vmovaps %xmm5, %xmm13 +; AVX-NEXT: vmovaps 176(%rdi), %xmm13 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX-NEXT: vmovaps 144(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 128(%rdi), %xmm12 @@ -4106,10 +4078,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX-NEXT: vmovaps 400(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 384(%rdi), %xmm4 -; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX-NEXT: vmovaps %xmm4, %xmm6 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4670,7 +4641,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride4_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $1944, %rsp # imm = 0x798 -; AVX2-NEXT: vmovaps 704(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 704(%rdi), %ymm13 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 @@ -4704,15 +4675,13 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-NEXT: vmovaps 736(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 736(%rdi), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm3, 
%ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm3, %ymm11 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm1 -; AVX2-NEXT: vmovaps %ymm7, %ymm13 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm11, %ymm2, %ymm0 +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm13, %ymm2, %ymm1 +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 672(%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4727,9 +4696,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-NEXT: vmovaps 992(%rdi), %ymm8 +; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm0 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 928(%rdi), %ymm4 @@ -4798,10 +4766,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm0 +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm4 ; AVX2-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5163,7 +5130,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride4_vf64: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $1944, %rsp # imm = 0x798 -; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 @@ -5197,15 +5164,13 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovaps %ymm3, %ymm11 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovaps %ymm7, %ymm13 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm11, %ymm2, %ymm0 +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm13, %ymm2, %ymm1 +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5220,9 +5185,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm8 +; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm4 @@ -5291,10 +5255,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm0 +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm1, %ymm2, %ymm4 ; AVX2-FP-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5656,7 +5619,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride4_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $1944, %rsp # imm = 0x798 -; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 @@ -5690,15 +5653,13 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovaps %ymm7, %ymm13 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5713,9 +5674,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: 
vmovaps 960(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm4 @@ -5784,10 +5744,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm4 ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index dd94dffa85932..d8d48b0b8c73d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -1300,15 +1300,14 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 192(%rdi), %xmm9 ; SSE-NEXT: movdqa 160(%rdi), %xmm10 ; SSE-NEXT: movdqa 176(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1524,14 +1523,13 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX-NEXT: vmovaps 256(%rdi), %ymm7 ; AVX-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX-NEXT: vmovaps 160(%rdi), %ymm13 ; AVX-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX-NEXT: vmovaps 32(%rdi), %ymm14 ; AVX-NEXT: vmovaps (%rdi), %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] -; AVX-NEXT: vmovaps %ymm1, %ymm14 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; 
AVX-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm3 @@ -1543,9 +1541,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX-NEXT: vmovaps %ymm9, %ymm13 -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm4 @@ -2536,9 +2533,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm7 ; SSE-NEXT: movdqa 80(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -2564,13 +2560,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 336(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2594,12 +2588,10 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2691,10 +2683,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] @@ -3028,12 +3019,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vmovaps %ymm1, %ymm7 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 @@ -3059,10 +3049,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps %ymm1, %ymm15 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps (%rdi), %ymm15 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vmovaps 64(%rdi), %ymm11 @@ -3364,12 +3353,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] @@ -3402,11 +3390,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), 
%ymm2 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 @@ -3643,12 +3629,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] @@ -3681,11 +3666,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 @@ -3922,12 +3905,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] @@ -3960,11 +3942,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm7 +; 
AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 @@ -5039,12 +5019,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1040(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1040(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1088(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5109,12 +5088,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5138,12 +5116,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 880(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 896(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 928(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5153,12 +5130,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1200(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 1200(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1248(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5184,10 +5160,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 480(%rdi), %xmm9 -; SSE-NEXT: movdqa 496(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -5199,12 +5174,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 800(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 848(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6191,9 +6165,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6326,8 +6300,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[2,0],ymm0[7,4],ymm4[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,1],ymm0[6,4],ymm4[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -6445,10 +6419,9 
@@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3,4,5,6,7] ; AVX-NEXT: vmovaps 1088(%rdi), %ymm11 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm1[0,0],ymm11[5,4],ymm1[4,4] -; AVX-NEXT: vmovaps %ymm1, %ymm3 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm3[0,0],ymm11[5,4],ymm3[4,4] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7051,10 +7024,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -7608,10 +7580,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm3 -; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -8165,10 +8136,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 8820dccc40bf4..e14a12d80f28d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -419,14 +419,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] @@ -874,11 +873,10 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 112(%rdi), %xmm3 ; SSE-NEXT: movdqa 64(%rdi), %xmm5 ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa %xmm10, %xmm7 @@ -1674,11 +1672,10 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 304(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rdi), %xmm12 ; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 48(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1711,18 +1708,16 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] @@ -1764,9 +1759,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] @@ -1809,8 +1803,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] @@ -1967,12 +1961,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdi), %ymm14 -; AVX-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0],ymm2[0,0],ymm3[6,4],ymm2[4,4] -; AVX-NEXT: vmovaps %ymm3, %ymm9 +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7] ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm7 @@ -2177,12 +2170,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps %ymm10, %ymm5 -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm10 ; AVX2-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2354,12 +2346,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps %ymm10, %ymm5 -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3401,13 +3392,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 144(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm6 @@ -3434,12 +3423,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 432(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill @@ -3450,29 +3438,26 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 304(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 336(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 688(%rdi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 720(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4463,9 +4448,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm1 -; AVX2-NEXT: vmovaps %ymm2, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4683,8 +4667,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm3, %ymm8, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload @@ -4701,8 +4685,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 80(%rdi), %xmm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm1 -; AVX2-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-NEXT: vpermps %ymm14, %ymm8, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload @@ -4846,9 +4829,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm6 +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5066,8 +5048,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm3, %ymm8, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload @@ -5084,8 +5066,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-FP-NEXT: vpermps %ymm14, %ymm8, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload @@ -5229,8 +5210,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5247,8 +5228,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm1 @@ -5430,8 +5410,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload @@ -5449,8 +5429,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm14 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm12, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload @@ -6765,12 +6744,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] @@ -6793,12 +6771,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1248(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1248(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1264(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1296(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6901,12 +6878,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1072(%rdi), 
%xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1104(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6979,12 +6955,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1360(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa 1392(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7285,10 +7260,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 896(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] @@ -10634,9 +10608,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index f616eafc24272..e03e19fc6d16f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -472,15 +472,14 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: pshufd 
{{.*#+}} xmm10 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] @@ -1015,7 +1014,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 144(%rdi), %xmm9 ; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa 192(%rdi), %xmm8 ; SSE-NEXT: movdqa 160(%rdi), %xmm10 @@ -1032,8 +1031,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] @@ -2013,16 +2011,15 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa 192(%rdi), %xmm14 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa 160(%rdi), %xmm12 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2051,10 +2048,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 416(%rdi), %xmm8 -; SSE-NEXT: movdqa 384(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 384(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2088,9 +2084,8 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: movdqa %xmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa 256(%rdi), %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] @@ -2165,9 +2160,8 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -4205,19 +4199,17 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: movdqa 576(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 160(%rdi), %xmm12 ; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] @@ -4272,13 +4264,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm3 +; SSE-NEXT: movdqa 864(%rdi), %xmm8 ; SSE-NEXT: movdqa 832(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm3 @@ -4423,11 +4414,10 @@ define void 
@load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4920,10 +4910,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vmovaps (%rdi), %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX-NEXT: vmovaps %xmm2, %xmm9 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rdi), %xmm9 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 160(%rdi), %xmm2 @@ -4931,10 +4920,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX-NEXT: vmovaps %xmm2, %xmm10 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1] +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4953,10 +4941,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX-NEXT: vmovaps %xmm2, %xmm4 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 640(%rdi), %xmm4 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1] +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5195,10 +5182,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0,1,2],xmm5[3] ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 
416(%rdi), %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,1],ymm13[1,3],ymm7[4,5],ymm13[5,7] -; AVX-NEXT: vmovaps %ymm7, %ymm14 -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 416(%rdi), %ymm14 +; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm14[0,1],ymm13[1,3],ymm14[4,5],ymm13[5,7] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,2],ymm10[2,0],ymm6[4,6],ymm10[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5475,20 +5461,18 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5505,8 +5489,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -5536,17 +5519,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 672(%rdi), %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm15 ; AVX2-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; 
AVX2-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5566,10 +5547,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5605,10 +5585,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vmovdqa 736(%rdi), %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] -; AVX2-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 736(%rdi), %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm15, %ymm5 ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] @@ -5619,12 +5598,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 @@ -5734,9 +5711,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7] ; AVX2-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -5795,8 +5772,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 736(%rdi), %xmm3 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-NEXT: vpermps %ymm5, %ymm11, %ymm4 -; AVX2-NEXT: vmovaps %ymm5, %ymm14 +; AVX2-NEXT: vpermps %ymm14, %ymm11, %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vbroadcastss 884(%rdi), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] @@ -5975,20 +5951,18 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -6005,8 +5979,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -6036,17 +6009,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa 
%ymm3, %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm15 ; AVX2-FP-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -6066,10 +6037,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6105,10 +6075,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] @@ -6119,12 +6088,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FP-NEXT: 
vmovdqa 160(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 @@ -6234,9 +6201,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7] ; AVX2-FP-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -6295,8 +6262,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm11, %ymm4 -; AVX2-FP-NEXT: vmovaps %ymm5, %ymm14 +; AVX2-FP-NEXT: vpermps %ymm14, %ymm11, %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vbroadcastss 884(%rdi), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] @@ -6472,13 +6438,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride7_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 @@ -6486,9 +6452,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6],ymm14[7] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -6521,8 +6486,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -6534,10 +6499,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -6585,7 +6549,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] @@ -6595,10 +6559,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -6614,13 +6577,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = 
ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 @@ -9993,11 +9954,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovaps 224(%rdi), %xmm13 @@ -10021,9 +9982,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vmovaps 672(%rdi), %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX-NEXT: vmovaps %xmm2, %xmm3 +; AVX-NEXT: vmovaps 672(%rdi), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 832(%rdi), %xmm2 @@ -10201,7 +10161,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -11192,14 +11152,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] @@ -11247,10 +11205,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1600(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 1568(%rdi), %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm13[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1568(%rdi), %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 1664(%rdi), %ymm3 @@ -11487,13 +11444,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -12222,14 +12177,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] +; 
AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] @@ -12277,10 +12230,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm13[6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %ymm3 @@ -12517,13 +12469,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -13251,14 +13201,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; 
AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] @@ -13307,10 +13255,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6],ymm5[7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13546,13 +13493,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -14292,11 +14237,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14312,11 +14256,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14332,11 +14275,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14352,11 +14294,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14744,11 +14685,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14764,11 +14704,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14784,11 +14723,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -14804,11 +14742,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15196,11 +15133,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15216,11 +15152,10 @@ define void 
@load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15236,11 +15171,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15256,11 +15190,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15648,11 +15581,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15668,11 +15600,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15688,11 +15619,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -15708,11 +15638,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16100,11 +16029,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16120,11 +16048,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16140,11 +16067,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16160,11 +16086,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16552,11 +16477,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16572,11 +16496,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16592,11 +16515,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -16612,11 +16534,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ 
-17004,11 +16925,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17024,11 +16944,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17044,11 +16963,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17064,11 +16982,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17456,11 +17373,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17476,11 +17392,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17496,11 +17411,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 @@ -17516,11 +17430,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 872a8d00cc234..cd0891385faff 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -2583,10 +2583,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride8_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-NEXT: vmovaps 288(%rdi), %xmm8 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-NEXT: vmovaps %xmm1, %xmm8 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vbroadcastss %xmm1, %xmm2 @@ -2612,25 +2611,22 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-NEXT: vmovaps 160(%rdi), %xmm6 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-NEXT: vmovaps %xmm0, %xmm6 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-NEXT: vmovaps %xmm0, %xmm4 -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-NEXT: vbroadcastss %xmm4, %xmm3 +; AVX2-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX2-NEXT: vbroadcastss %xmm12, %xmm11 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-NEXT: vmovaps %xmm0, %xmm5 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX2-NEXT: vbroadcastss %xmm5, %xmm2 +; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %xmm13 ; AVX2-NEXT: vbroadcastss %xmm13, %xmm3 ; AVX2-NEXT: vunpcklps 
{{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -2861,10 +2857,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride8_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm8 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FP-NEXT: vmovaps %xmm1, %xmm8 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm2 @@ -2890,25 +2885,22 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FP-NEXT: vmovaps %xmm0, %xmm6 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm0, %xmm4 -; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-FP-NEXT: vbroadcastss %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm11 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm0, %xmm5 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX2-FP-NEXT: vbroadcastss %xmm5, %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm13 ; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm3 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -3139,10 +3131,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride8_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FCP-NEXT: vmovaps %xmm1, %xmm8 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm2 @@ -3168,25 +3159,22 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps %xmm0, %xmm6 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm0, %xmm4 -; AVX2-FCP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm11 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm0, %xmm5 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm13 ; AVX2-FCP-NEXT: vbroadcastss %xmm13, %xmm3 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -4598,12 +4586,10 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 208(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 176(%rdi), %xmm3 -; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5189,17 +5175,14 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX-NEXT: vmovaps 416(%rdi), %ymm13 +; AVX-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX-NEXT: vmovaps 448(%rdi), %ymm9 ; AVX-NEXT: vmovaps 480(%rdi), %ymm12 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX-NEXT: vmovaps %ymm1, %ymm9 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX-NEXT: vmovaps %ymm3, 
%ymm5 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps %ymm2, %ymm13 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5209,10 +5192,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX-NEXT: vmovaps %ymm4, %ymm11 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm2[0],ymm11[2],ymm2[2] +; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] @@ -5251,10 +5233,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX-NEXT: vmovaps %ymm4, %ymm10 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 736(%rdi), %ymm10 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 544(%rdi), %ymm0 @@ -5567,17 +5548,14 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-NEXT: vmovaps %xmm2, %xmm10 -; AVX2-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-NEXT: vmovaps %xmm2, %xmm14 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX2-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-NEXT: vmovaps %xmm3, %xmm15 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-NEXT: vmovaps 352(%rdi), %xmm14 +; AVX2-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 320(%rdi), %xmm15 +; AVX2-NEXT: vbroadcastss %xmm15, %xmm2 +; AVX2-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1 @@ -6191,17 +6169,14 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FP-NEXT: vmovaps %xmm2, %xmm10 -; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm2, %xmm14 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm3, %xmm15 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm14 +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm15 +; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1 @@ -6815,17 +6790,14 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm10 -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm14 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm3, %xmm15 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm14 +; AVX2-FCP-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm15 +; AVX2-FCP-NEXT: vbroadcastss %xmm15, %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1 @@ -9663,9 +9635,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 176(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -10331,10 +10302,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX: # %bb.0: ; AVX-NEXT: subq $3720, %rsp # imm = 0xE88 ; AVX-NEXT: vmovaps 288(%rdi), %xmm13 -; AVX-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX-NEXT: vmovaps %xmm2, %xmm15 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 256(%rdi), %xmm15 +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 320(%rdi), %xmm2 @@ -10944,14 +10914,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vmovaps %ymm1, %ymm4 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX-NEXT: vmovaps %ymm3, %ymm15 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX-NEXT: vmovaps 256(%rdi), %ymm1 @@ -10961,9 +10929,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX-NEXT: vmovaps %ymm5, %ymm9 +; AVX-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] @@ -10971,12 +10938,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 672(%rdi), %ymm12 ; AVX-NEXT: vmovaps 640(%rdi), %ymm8 -; AVX-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX-NEXT: vmovaps 704(%rdi), %ymm13 ; AVX-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vmovaps %ymm1, %ymm13 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps 544(%rdi), %ymm1 @@ -11716,9 +11682,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-NEXT: vmovaps %xmm2, %xmm9 +; AVX2-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 @@ -11730,9 +11695,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-NEXT: vmovaps %xmm3, %xmm13 +; AVX2-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-NEXT: vbroadcastss %xmm13, %xmm2 ; AVX2-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vbroadcastss %xmm3, %xmm3 @@ -13070,9 +13034,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm2, %xmm9 +; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 @@ -13084,9 +13047,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm3, %xmm13 +; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm2 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm3 @@ -14424,9 +14386,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm9 +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 @@ -14438,9 +14399,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm3, %xmm13 +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-FCP-NEXT: vbroadcastss %xmm13, %xmm2 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm3 @@ -15787,15 +15747,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15820,8 +15780,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15830,17 +15789,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16384,15 +16340,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512-FCP-NEXT: 
vmovdqa64 1216(%rdi), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16417,8 +16373,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16427,17 +16382,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16981,15 +16933,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17014,8 +16966,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 
-; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17024,17 +16975,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17578,15 +17526,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17611,8 +17559,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17621,17 +17568,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, 
%zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18175,15 +18119,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18208,8 +18152,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18218,17 +18161,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18772,15 +18712,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18805,8 +18745,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18815,17 +18754,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19369,15 +19305,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), 
%zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19402,8 +19338,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19412,17 +19347,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19966,15 +19898,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19999,8 +19931,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20009,17 +19940,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index 4e9440140592e..94e91f546a9a3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -3398,16 +3398,14 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovapd 896(%rdi), %ymm0 ; AVX-NEXT: vmovapd 704(%rdi), %ymm1 ; AVX-NEXT: vmovapd 512(%rdi), %ymm2 -; AVX-NEXT: vmovapd 320(%rdi), %ymm3 -; AVX-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm4[2,3] +; AVX-NEXT: vmovapd 320(%rdi), %ymm5 +; AVX-NEXT: vmovapd 128(%rdi), %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm6[2,3] ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd %ymm4, %ymm6 -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm5[2,3] ; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd %ymm3, %ymm5 -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd %ymm2, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index 70164cff89072..0648d1b4abdf6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -4089,39 +4089,35 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm1, %ymm14 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 928(%rdi), %ymm10 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4143,10 +4139,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4476,39 +4471,35 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: 
vmovdqa 928(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm10 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4530,10 +4521,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4863,39 +4853,35 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm14 
+; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4917,10 +4903,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8742,59 +8727,53 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm8 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1008(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm7 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1328(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8808,10 +8787,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2288(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm2 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm14 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9564,59 +9542,53 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm8 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1008(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm7 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1328(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; 
AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9630,10 +9602,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2288(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10386,59 +10357,53 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} 
xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1008(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1328(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10452,10 +10417,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2288(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 21e1b17760c24..f82bcd1ce3e1e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -2508,15 +2508,14 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 512(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2537,10 +2536,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 704(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-NEXT: vmovaps %ymm0, %ymm10 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-NEXT: vmovaps 624(%rdi), %xmm14 ; 
AVX2-NEXT: vmovaps 576(%rdi), %xmm0 @@ -2722,15 +2720,14 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2751,10 +2748,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps %ymm0, %ymm10 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm0 @@ -2936,15 +2932,14 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2965,10 +2960,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = 
ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm10 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm0 @@ -5512,29 +5506,26 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX2-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 352(%rdi), %ymm14 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-NEXT: vmovaps %ymm5, %ymm14 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 736(%rdi), %ymm7 ; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-NEXT: vmovaps 1120(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 1120(%rdi), %ymm6 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6002,29 +5993,26 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FP-NEXT: vmovaps %ymm5, %ymm14 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; 
AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FP-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FP-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6492,29 +6480,26 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm14 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; 
AVX2-FCP-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12171,99 +12156,89 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 352(%rdi), %ymm15 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm15 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1] -; AVX2-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 736(%rdi), %ymm4 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm4 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 1120(%rdi), %ymm6 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1312(%rdi), %ymm7 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 1504(%rdi), %ymm8 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-NEXT: vmovaps 1696(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 1696(%rdi), %ymm9 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm9 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 1888(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 1888(%rdi), %ymm10 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 2080(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 2080(%rdi), %ymm11 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovaps %ymm2, %ymm11 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13239,99 +13214,89 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm15 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1] -; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm4 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm7 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm9 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 2080(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 2080(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovaps %ymm2, %ymm11 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14307,99 +14272,89 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm15 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1] -; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm3 -; 
AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm7 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 2080(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 2080(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 1d1da0954d675..c5be77db9ecf5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -7780,7 +7780,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm26 @@ -7795,17 +7795,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 +; AVX512-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8263,7 +8261,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 @@ -8278,17 +8276,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8746,7 +8742,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm26 @@ -8761,17 +8757,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9229,7 +9223,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 @@ -9244,17 +9238,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; 
AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9739,8 +9731,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -9770,13 +9762,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -9799,8 +9791,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 @@ -10213,8 +10204,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -10244,13 +10235,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm18, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -10273,8 +10264,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 @@ -10687,8 +10677,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -10718,13 +10708,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm10 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -10747,8 +10737,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 @@ -11161,8 +11150,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -11192,13 +11181,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm18, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -11221,8 +11210,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 @@ -17370,11 +17358,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm16 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm31 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 @@ -17389,11 +17376,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm6 @@ -17410,11 +17396,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm19 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm8 @@ -17429,12 +17414,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm10 @@ -18369,11 +18353,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 @@ -18388,11 +18371,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 @@ -18409,11 +18391,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8 @@ -18428,12 +18409,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10 @@ -19368,11 +19348,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm31 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 @@ -19387,11 +19366,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm6 @@ -19408,11 +19386,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm8 @@ -19427,12 +19404,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm10 @@ -20367,11 +20343,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 @@ -20386,11 +20361,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 @@ -20407,11 +20381,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8 @@ -20426,12 +20399,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10 @@ -21367,12 +21339,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 @@ -21387,11 +21358,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 @@ -21740,16 +21710,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 ; AVX512BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 @@ -21775,10 +21744,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 ; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 @@ -22349,12 +22318,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1 @@ -22369,11 +22337,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 @@ -22722,16 +22689,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 @@ -22757,10 +22723,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 @@ -23331,12 +23297,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm1 @@ -23351,11 +23316,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm3, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm29 @@ -23704,16 +23668,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm29 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 @@ -23739,10 +23702,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 @@ -24313,12 +24276,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1 @@ -24333,11 +24295,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 @@ -24686,16 +24647,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 @@ -24721,10 +24681,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index ceb4948726760..51b6222077f82 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -8740,7 +8740,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8759,7 +8759,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -8819,8 +8819,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -8890,19 +8889,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -9261,7 +9258,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9280,7 +9277,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -9340,8 +9337,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; 
AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -9411,19 +9407,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -9782,7 +9776,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9801,7 +9795,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -9861,8 +9855,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -9932,19 +9925,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -10303,7 +10294,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10322,7 +10313,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -10382,8 +10373,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -10453,19 +10443,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -10824,7 +10812,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10843,7 +10831,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -10903,8 +10891,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -10974,19 +10961,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512BW-NEXT: 
vmovdqa64 %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -11345,7 +11330,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11364,7 +11349,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -11424,8 +11409,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -11495,19 +11479,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -11866,7 +11848,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11885,7 +11867,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -11945,8 +11927,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -12016,19 +11997,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -12387,7 +12366,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), 
%zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12406,7 +12385,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 @@ -12466,8 +12445,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] @@ -12537,19 +12515,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 @@ -19613,16 +19589,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = 
zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -20684,16 +20659,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -21755,16 +21729,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; 
AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -22826,16 +22799,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -23897,16 +23869,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -24968,16 +24939,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -26039,16 +26009,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 @@ -27110,16 +27079,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm2[1],zmm17[3],zmm2[3],zmm17[5],zmm2[5],zmm17[7],zmm2[7] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm17 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 15f6ef4006fdd..01181d4b21d9d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -2476,13 +2476,11 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX: # %bb.0: ; AVX-NEXT: subq $328, %rsp # imm = 0x148 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX-NEXT: vmovdqa %xmm2, %xmm6 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm8 +; AVX-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm0 +; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 @@ -2725,12 +2723,10 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX2-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-NEXT: vmovdqa 176(%rdi), %xmm8 +; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm10 +; AVX2-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2899,12 +2895,10 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX2-FP-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm8 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm10 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3073,12 +3067,10 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm10 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index db8c74f2741c8..e7bb02db62753 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -1119,7 +1119,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa 64(%rdi), %xmm9 ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] @@ -1143,8 +1143,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm15, %xmm5 ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: pand %xmm4, %xmm6 @@ -5210,45 +5209,38 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX: # %bb.0: ; AVX-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] -; AVX-NEXT: vmovdqa (%rdi), %xmm4 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, %xmm11 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rdi), %xmm8 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX-NEXT: vmovdqa %xmm4, %xmm8 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] ; AVX-NEXT: # xmm4 = mem[0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX-NEXT: vmovdqa %xmm5, %xmm9 -; AVX-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX-NEXT: 
vpshufb %xmm4, %xmm9, %xmm0 +; AVX-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] ; AVX-NEXT: # xmm5 = mem[0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX-NEXT: vmovdqa %xmm7, %xmm12 -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6 +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpor %xmm0, %xmm6, %xmm6 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, %xmm14 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vmovdqa %xmm1, %xmm13 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 176(%rdi), %xmm14 +; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX-NEXT: vmovdqa %xmm1, %xmm10 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm4 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5277,11 +5269,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4 ; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 ; AVX-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, %xmm8 +; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] ; AVX-NEXT: # xmm7 = mem[0,0] ; AVX-NEXT: vmovdqa 128(%rdi), %xmm13 @@ -5631,9 +5622,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] @@ -5642,9 +5632,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm5, 
%ymm4, %ymm2, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] +; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] @@ -5656,9 +5645,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] +; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] @@ -5670,9 +5658,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 @@ -5897,9 +5884,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] @@ -5908,9 +5894,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: 
vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] @@ -5922,9 +5907,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] @@ -5936,9 +5920,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 @@ -6163,9 +6146,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] @@ -6174,9 +6156,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = 
[65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] @@ -6188,9 +6169,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] @@ -6202,9 +6182,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 2ca665d7981a9..9ce685f13e476 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -1591,8 +1591,8 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm3, %xmm11 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] @@ -1603,8 +1603,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7] @@ -2582,24 +2581,22 @@ define void @load_i8_stride6_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pand %xmm10, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -2612,8 +2609,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm11, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] @@ -3204,8 +3200,8 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa (%rdi), %xmm9 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[4,10,u,u,u,u,u,u,u,u,u,u,u] @@ -3214,7 +3210,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[5,11,u,u,u,u,u,u,u,u,u,u,u] @@ -3225,7 +3221,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] @@ -3235,9 +3231,8 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm8, %xmm5 -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] @@ -4681,9 +4676,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -4868,15 +4862,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm4 @@ -4886,8 +4879,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: movdqa 160(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pandn %xmm13, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5180,16 +5173,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa 
{{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -5201,8 +5193,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5] @@ -5219,23 +5210,21 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm13 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] @@ -5247,7 +5236,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] @@ -5262,15 +5251,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm9 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload @@ -5957,7 +5946,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: subq $616, %rsp # imm = 0x268 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm8 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm9 @@ -5965,14 +5954,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 192(%rdi), %xmm11 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, %xmm13 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm1 -; AVX-NEXT: vmovdqa %xmm5, %xmm15 -; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm4, %xmm14 +; AVX-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm0 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] +; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm1 +; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm0 ; AVX-NEXT: vmovdqa %xmm2, %xmm4 @@ -5989,17 +5975,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX-NEXT: vmovdqa %xmm2, %xmm13 +; AVX-NEXT: vmovd {{.*#+}} xmm13 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] ; AVX-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm15 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm2 ; AVX-NEXT: vmovdqa %xmm7, %xmm14 -; AVX-NEXT: vmovdqa %xmm3, %xmm15 ; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm3 ; AVX-NEXT: vmovdqa %xmm4, %xmm7 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -6014,16 +5998,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; 
AVX-NEXT: vmovdqa %xmm1, %xmm12 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX-NEXT: vmovdqa %xmm2, %xmm13 +; AVX-NEXT: vmovq {{.*#+}} xmm12 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm13 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm0 -; AVX-NEXT: vmovdqa %xmm2, %xmm4 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm0 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm2 @@ -6042,18 +6023,16 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm8 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX-NEXT: vmovdqa %xmm2, %xmm8 +; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa %xmm14, %xmm13 ; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX-NEXT: vmovdqa %xmm3, %xmm14 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm9 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm2 ; AVX-NEXT: vmovdqa %xmm6, %xmm10 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] @@ -6082,12 +6061,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX-NEXT: # xmm4 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX-NEXT: # xmm11 = mem[0,0] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm4, %xmm11 +; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero @@ -6126,11 +6104,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm10, %xmm10 ; AVX-NEXT: vmovdqa %ymm2, %ymm5 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX-NEXT: vandps %ymm7, %ymm9, %ymm9 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX-NEXT: vandnps %ymm10, %ymm2, %ymm10 -; AVX-NEXT: vmovaps %ymm2, %ymm7 +; AVX-NEXT: vandnps %ymm10, %ymm7, %ymm10 ; AVX-NEXT: vorps %ymm10, %ymm9, %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 @@ -6163,10 +6140,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm0 -; AVX-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vmovdqa %xmm4, %xmm10 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 336(%rdi), %xmm10 +; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6371,11 +6347,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm15, %xmm8 ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpblendvb %xmm13, %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX-NEXT: vmovaps %ymm6, %ymm7 +; AVX-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[5,11,u,u,u,u,u,u,u,u,u,u,u] @@ -6435,11 +6410,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX-NEXT: vmovaps %ymm6, %ymm0 +; AVX-NEXT: vandnps %ymm5, %ymm0, %ymm5 ; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index aa9a9f20645e2..bea6219b9fbac 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -933,14 +933,13 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm5 @@ -956,9 +955,8 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm5 @@ -1867,13 +1865,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 64(%rdi), %xmm7 ; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm12 ; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -1891,8 +1888,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3621,9 +3617,9 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 208(%rdi), %xmm14 ; SSE-NEXT: movdqa 192(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa 176(%rdi), %xmm13 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm11 ; SSE-NEXT: movdqa 160(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 @@ -3649,9 +3645,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3678,9 +3673,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -3732,11 +3726,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm9 @@ -3758,11 +3751,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pand %xmm7, %xmm2 @@ -4710,13 +4702,12 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vmovdqa (%rdi), %xmm10 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa %xmm3, %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] @@ -5020,9 +5011,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbw 
{{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm14, %ymm3, %ymm13, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -5232,9 +5222,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm13, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -5432,16 +5421,15 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-LABEL: load_i8_stride7_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $40, %rsp -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] @@ -5461,17 +5449,15 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,16777215,0] ; 
AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] @@ -9313,9 +9299,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm2, %xmm10 +; AVX-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0] @@ -9355,25 +9340,21 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: # xmm4 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm7 +; AVX-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; AVX-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX-NEXT: vmovdqa %xmm5, %xmm12 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm9 -; AVX-NEXT: vmovdqa %xmm6, %xmm14 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm9 ; AVX-NEXT: vpor %xmm1, %xmm9, %xmm9 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] ; AVX-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 400(%rdi), %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovdqa %xmm2, %xmm9 +; AVX-NEXT: vmovdqa 400(%rdi), %xmm9 +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm6 ; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm2 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -9453,8 +9434,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm13 ; AVX-NEXT: vpor %xmm0, %xmm13, %xmm13 -; AVX-NEXT: vmovq {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm13, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0 @@ -9466,8 +9447,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm9, %xmm12 +; AVX-NEXT: vpblendvb %xmm12, %xmm14, %xmm2, %xmm2 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm2 @@ -9517,8 +9497,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -9543,8 +9523,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX-NEXT: vmovdqa %xmm14, %xmm15 +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX-NEXT: vpor %xmm0, %xmm4, %xmm5 @@ -9587,23 +9566,20 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm6, %xmm9 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX-NEXT: vmovdqa 192(%rdi), %xmm5 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX-NEXT: vmovdqa %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm13 -; AVX-NEXT: vmovdqa %xmm0, %xmm5 +; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX-NEXT: vpmovsxdq {{.*#+}} xmm13 = [18446744073709486080,16777215] ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, %xmm4 +; AVX-NEXT: vmovdqa 432(%rdi), %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm0 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 @@ -9637,9 +9613,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX-NEXT: vmovdqa %xmm0, %xmm14 +; AVX-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm1 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 @@ -9694,11 +9669,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm7 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX-NEXT: vmovdqa %xmm2, %xmm6 +; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm10 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9983,9 +9957,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm4 -; AVX-NEXT: vmovdqa %xmm5, %xmm8 +; AVX-NEXT: vmovd {{.*#+}} xmm8 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm8, %xmm13, %xmm4 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] ; AVX-NEXT: # xmm4 = mem[0,0] @@ -10009,10 +9982,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX-NEXT: vandnps %ymm9, %ymm1, %ymm9 -; AVX-NEXT: vmovaps %ymm1, %ymm13 +; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX-NEXT: vandnps %ymm9, %ymm13, %ymm9 ; AVX-NEXT: vorps %ymm5, %ymm9, %ymm5 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9 @@ -10150,30 +10122,25 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; 
AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm10, %ymm0 +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm11, %ymm3 ; AVX2-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] @@ -10188,12 +10155,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 -; AVX2-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm15, %ymm2 +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] @@ -10203,9 +10169,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 -; AVX2-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] ; AVX2-NEXT: 
vpshufb %xmm2, %xmm3, %xmm7 @@ -10230,10 +10195,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm15, %ymm2 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] @@ -10244,11 +10208,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm10 +; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm10 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa %xmm6, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -10689,30 +10652,25 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm10, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 -; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FP-NEXT: 
vpblendvb %ymm14, %ymm5, %ymm11, %ymm3 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] @@ -10727,12 +10685,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm15, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] @@ -10742,9 +10699,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm5 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm7 @@ -10769,10 +10725,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm15, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] @@ -10783,11 +10738,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm3 
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm10 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm10 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -11228,16 +11182,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 @@ -11245,11 +11197,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] @@ -11264,14 +11214,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm15, %ymm8, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -11281,9 +11230,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm12, %ymm4 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 @@ -11306,15 +11254,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm10, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 @@ -11322,11 +11267,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -11334,18 +11278,16 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm7 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] @@ -11388,8 +11330,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm15 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm12 @@ -11427,8 +11369,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm13 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5 +; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm7 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm15 @@ -11526,8 +11469,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm9 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, 
%ymm0, %ymm10 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 @@ -11540,8 +11483,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm8 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 @@ -11795,12 +11737,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm22 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -11866,9 +11806,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm3 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm3) -; AVX512-NEXT: vmovdqa %ymm3, %ymm15 +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [18446744073709551615,255,18446744073709486080,18446744073709551615] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm15) ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -12216,9 +12155,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm8 +; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 @@ -12617,12 +12555,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm3 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm4 ^ ymm25)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] @@ -12638,12 +12574,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm3 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm5 +; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -12710,9 +12644,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} ymm2 = ymm2 | (ymm0 & ymm3) -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm27 = [18446744073709551615,255,18446744073709486080,18446744073709551615] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm27) ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15 ; AVX512DQ-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13048,11 +12981,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm1 & (ymm31 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm31 ^ ymm24)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] @@ -13063,9 +12995,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index b1eb4d6af4eb7..5b607748c5761 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -4741,10 +4741,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX-NEXT: vpshufb %xmm10, %xmm4, %xmm0 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm3 -; AVX-NEXT: vmovdqa %xmm1, %xmm7 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm3 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 @@ 
-4798,10 +4797,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 144(%rdi), %xmm15 ; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm0 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX-NEXT: vmovdqa %xmm6, %xmm14 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -6076,29 +6074,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 ; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm3, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm15 ; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] @@ -6375,22 +6370,18 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 240(%rdi), %xmm12 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512-NEXT: vmovdqa %xmm4, %xmm7 +; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1 +; 
AVX512-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512-NEXT: vmovdqa %xmm5, %xmm9 -; AVX512-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX512-NEXT: vmovdqa %xmm5, %xmm8 +; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm2 +; AVX512-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512-NEXT: vpshufb %xmm4, %xmm8, %xmm6 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -6400,9 +6391,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX512-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm6 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18 @@ -6418,7 +6408,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm10 ; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4 @@ -6436,14 +6426,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] ; AVX512-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512-NEXT: vmovdqa %xmm0, %xmm10 +; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm13 ; AVX512-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512-NEXT: vmovdqa %xmm9, %xmm13 +; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm0 ; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -6752,9 +6740,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb 
%xmm3, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] @@ -6967,22 +6954,18 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm12 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm1 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm9 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm2 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm8, %xmm6 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -6992,9 +6975,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm6 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18 @@ -7010,7 +6992,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm10 ; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm4 @@ -7028,14 +7010,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512DQ-NEXT: vmovdqa 
%xmm0, %xmm10 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm13 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm13 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -7344,9 +7324,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] @@ -10498,10 +10477,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX-NEXT: vmovdqa %xmm4, %xmm15 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 336(%rdi), %xmm15 +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 @@ -11356,11 +11334,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX2-NEXT: vmovdqa 336(%rdi), %xmm15 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -11378,10 +11355,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 256(%rdi), %xmm8 -; 
AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX2-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm7 +; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -12250,11 +12226,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm15 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -12272,10 +12247,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm7 +; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -13814,19 +13788,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512-NEXT: vmovdqa 496(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa 496(%rdi), %xmm7 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512-NEXT: vmovdqa 464(%rdi), %xmm11 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512-NEXT: vmovdqa %xmm5, %xmm11 +; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm4 ; AVX512-NEXT: vmovdqa 448(%rdi), 
%xmm15 ; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm5 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -13841,9 +13812,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm3 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512-NEXT: vmovdqa 352(%rdi), %xmm5 -; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512-NEXT: vmovdqa %xmm5, %xmm14 +; AVX512-NEXT: vmovdqa 352(%rdi), %xmm14 +; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512-NEXT: vmovdqa 336(%rdi), %xmm4 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14565,27 +14535,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride8_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14605,14 +14572,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm15 +; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpshufb 
%xmm5, %xmm15, %xmm8 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512-FCP-NEXT: vpmovqb %zmm28, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 ; AVX512-FCP-NEXT: movb $-64, %al @@ -14855,11 +14820,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] @@ -15130,19 +15094,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512DQ-NEXT: vmovdqa 496(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa 496(%rdi), %xmm7 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm11 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm11 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm4 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm15 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm5 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -15157,9 +15118,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm14 +; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm14 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm4 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; 
AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm4 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -15881,27 +15841,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15921,14 +15878,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm28, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: movb $-64, %al @@ -16171,11 +16126,10 
@@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] @@ -16449,22 +16403,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa 496(%rdi), %xmm4 +; AVX512BW-NEXT: vmovdqa64 496(%rdi), %xmm24 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX512BW-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512BW-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512BW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm24, %xmm3 +; AVX512BW-NEXT: vmovdqa64 480(%rdi), %xmm25 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm25, %xmm4 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm6 +; AVX512BW-NEXT: vmovdqa64 464(%rdi), %xmm26 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512BW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX512BW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm30 +; AVX512BW-NEXT: vpshufb %xmm19, %xmm26, %xmm4 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512BW-NEXT: vpshufb %xmm19, %xmm30, %xmm6 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -16474,20 +16424,17 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 -; AVX512BW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-NEXT: vmovdqa64 368(%rdi), %xmm31 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm31, %xmm4 +; AVX512BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vmovdqa64 352(%rdi), %xmm27 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 -; AVX512BW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 -; AVX512BW-NEXT: vmovdqa %xmm2, %xmm9 +; AVX512BW-NEXT: vmovdqa64 336(%rdi), %xmm22 +; AVX512BW-NEXT: vpshufb %xmm19, %xmm22, %xmm11 +; AVX512BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX512BW-NEXT: vpshufb %xmm19, %xmm9, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm11 @@ -16499,10 +16446,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm28 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm28, %xmm7 -; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm10 -; AVX512BW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-NEXT: vmovdqa64 224(%rdi), %xmm18 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm18, %xmm10 +; AVX512BW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-NEXT: vmovdqa64 208(%rdi), %xmm17 @@ -16520,12 +16466,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm16 -; AVX512BW-NEXT: vmovdqa %xmm0, %xmm10 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512BW-NEXT: vmovdqa %xmm0, %xmm14 +; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm10, %xmm16 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3] ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -16543,8 +16487,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX512BW-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa64 416(%rdi), %xmm20 +; AVX512BW-NEXT: vmovdqa64 400(%rdi), %xmm21 +; AVX512BW-NEXT: vmovdqa64 416(%rdi), %xmm23 ; AVX512BW-NEXT: vmovdqa64 432(%rdi), %xmm29 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: vmovdqa64 %xmm24, %xmm19 @@ -16564,13 +16508,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm26 = 
[1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 -; AVX512BW-NEXT: vpshufb %xmm26, %xmm20, %xmm25 -; AVX512BW-NEXT: vmovdqa64 %xmm20, %xmm23 +; AVX512BW-NEXT: vpshufb %xmm26, %xmm23, %xmm25 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: vpshufb %xmm30, %xmm3, %xmm24 -; AVX512BW-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512BW-NEXT: vpshufb %xmm30, %xmm21, %xmm24 ; AVX512BW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 @@ -17133,31 +17075,27 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm30, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31 -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm31, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm19, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa %ymm3, %ymm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29 ; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; 
AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] @@ -17214,12 +17152,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm31, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13 @@ -17263,16 +17199,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm9, %ymm21 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm25 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] @@ -17313,9 +17246,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] @@ -17576,22 +17508,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa 496(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 496(%rdi), %xmm24 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512DQ-BW-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm24, %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 480(%rdi), %xmm25 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm25, %xmm4 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 464(%rdi), %xmm26 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm30 +; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm26, %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm30, %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -17601,20 +17529,17 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 368(%rdi), %xmm31 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm31, %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %xmm27 ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512DQ-BW-NEXT: vmovdqa 336(%rdi), %xmm2 
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 336(%rdi), %xmm22 +; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm22, %xmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm9, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] ; AVX512DQ-BW-NEXT: vpmovqb %zmm1, %xmm11 @@ -17626,10 +17551,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm28 ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm28, %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm1, %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512DQ-BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 224(%rdi), %xmm18 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm18, %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-BW-NEXT: vmovdqa64 208(%rdi), %xmm17 @@ -17647,12 +17571,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm0, %xmm16 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, %xmm14 +; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm10, %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3] ; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -17670,8 +17592,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 416(%rdi), %xmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 400(%rdi), %xmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 416(%rdi), %xmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 432(%rdi), %xmm29 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, %xmm19 @@ -17691,13 +17613,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 -; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm20, %xmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm20, %xmm23 +; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm23, %xmm25 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm3, %xmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm21, %xmm24 ; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 @@ -18260,31 +18180,27 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm30, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm31, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm19, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 
%ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] @@ -18341,12 +18257,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm31, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13 @@ -18390,16 +18304,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm9, %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] @@ -18440,9 +18351,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index a01c3da43c339..0beb304a5673d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1164,16 +1164,15 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: movdqa 48(%rsi), %xmm11 ; SSE-NEXT: movdqa 32(%rdx), %xmm10 ; SSE-NEXT: movdqa 48(%rdx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] @@ -2080,13 +2079,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 32(%rdi), %xmm6 ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa (%rdx), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm10 ; SSE-NEXT: movdqa 16(%rdx), %xmm9 ; SSE-NEXT: movdqa 32(%rdx), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 @@ -2165,10 +2163,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 @@ -2195,10 +2192,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 64(%rdi), %xmm6 @@ -2225,10 +2221,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 80(%rdi), %xmm6 @@ -2255,10 +2250,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 96(%rdi), %xmm6 @@ -2283,10 +2277,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm1, %xmm13 ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa 112(%rdx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 112(%rdi), %xmm8 @@ -3097,13 +3090,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-LABEL: store_i16_stride3_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa (%rsi), %xmm5 @@ -3249,13 +3240,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-LABEL: store_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 @@ -3401,13 +3390,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-LABEL: store_i16_stride3_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 @@ -3553,13 +3540,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-LABEL: store_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index aa449b36c2eb5..23290ea3f8f54 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -269,14 +269,13 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride5_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero @@ -2744,10 +2743,9 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vmovaps %ymm3, %ymm12 +; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm3 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] @@ -6756,9 +6754,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: # %bb.0: ; AVX512-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX512-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = 
[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm0 ; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm17 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] @@ -7145,9 +7142,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm5 @@ -7180,9 +7176,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,2,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] @@ -7389,9 +7384,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm17 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] @@ -7778,9 +7772,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512DQ-FCP-NEXT: 
vpshufb %xmm5, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm5 @@ -7813,9 +7806,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,2,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 93262fb657d49..1073a35353a63 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -1307,9 +1307,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa (%r8), %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3] -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: andnps %xmm1, %xmm0 @@ -1722,13 +1721,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm4 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] @@ -2570,11 +2568,10 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm11 @@ -2598,11 +2595,10 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] ; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] @@ -3105,7 +3101,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -3133,9 +3129,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] -; AVX2-NEXT: vmovdqa %xmm6, %xmm14 -; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,1] +; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-NEXT: vmovdqa 32(%r9), %xmm4 @@ -3226,9 +3221,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 -; AVX2-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vpblendvb %ymm3, %ymm6, %ymm14, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] @@ -3246,7 +3240,7 @@ define void 
@store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -3377,7 +3371,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 @@ -3410,9 +3404,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -3421,19 +3414,17 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm9 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm14 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, 
%ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3655,12 +3646,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] @@ -3688,31 +3679,27 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm12 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm8 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FCP-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3787,9 +3774,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm13, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -5232,15 +5218,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: andnps %xmm6, %xmm0 @@ -5275,15 +5259,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r8), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r9), %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: andnps %xmm8, %xmm0 @@ -5321,10 +5303,9 @@ 
define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 48(%r8), %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm11[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r9), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: andnps %xmm11, %xmm0 @@ -5362,10 +5343,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 64(%r8), %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm12[0,1] -; SSE-NEXT: movdqa 64(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%r9), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: andnps %xmm12, %xmm0 @@ -5403,10 +5383,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 80(%r8), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm15[0,1] -; SSE-NEXT: movdqa 80(%r9), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%r9), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: andnps %xmm15, %xmm0 @@ -5510,11 +5489,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: orps %xmm0, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm13 @@ -5538,11 +5516,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps 
{{.*#+}} xmm13 = xmm13[2,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] @@ -5564,12 +5541,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa %xmm4, %xmm2 @@ -5592,11 +5568,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] @@ -5618,12 +5593,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm2 @@ -5669,11 +5643,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -5718,11 +5691,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -5767,11 +5739,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -5816,11 +5787,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -5834,21 +5804,18 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm10, %xmm9 ; SSE-NEXT: pandn %xmm2, %xmm9 ; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 @@ -5864,12 +5831,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm11, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] ; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] @@ -6605,18 +6571,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vmovdqa %xmm2, %xmm11 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] -; AVX2-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8 @@ -6643,9 +6607,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; AVX2-NEXT: vmovdqa %xmm7, %xmm13 -; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,2,1] +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] ; AVX2-NEXT: vmovdqa %xmm8, %xmm5 @@ -6764,10 +6727,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,3,6,5,6,7] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm15 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,3,6,5,6,7] @@ -7202,7 +7164,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm11 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm6 @@ -7230,9 +7192,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm11 -; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7293,11 +7254,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm14 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] ; AVX2-FP-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm9 @@ -7324,11 +7284,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm15 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7938,9 +7897,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm3 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 @@ -8399,9 +8357,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %ymm7, %ymm29 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm18 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] @@ -8479,9 +8436,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] ; AVX512-NEXT: vmovdqa (%r8), %xmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX512-NEXT: vmovdqa %xmm2, %xmm11 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpshufb %xmm11, %xmm8, %xmm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -8850,9 +8806,8 @@ define 
void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm25, %ymm0 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm12 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8935,9 +8890,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9504,10 +9458,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 @@ -9902,13 +9855,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm29, %zmm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,0,3,10,0,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm0 @@ -9967,10 +9918,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm2 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 6e251f8675bea..f135b2f1577ec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -1709,7 +1709,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $216, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 ; SSE-NEXT: movdqa 16(%rdx), %xmm15 ; SSE-NEXT: movdqa 16(%rcx), %xmm1 ; SSE-NEXT: movdqa 16(%r8), %xmm8 @@ -1726,10 +1726,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1782,9 +1781,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa (%rax), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -2291,7 +2289,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-NEXT: vmovdqa (%rdx), 
%ymm5 ; AVX2-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2312,8 +2310,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] -; AVX2-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] @@ -2487,11 +2484,11 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%r9), %ymm13 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] @@ -2500,17 +2497,15 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm8, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] @@ -2577,8 +2572,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm12 -; 
AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] @@ -3564,21 +3559,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm1 -; SSE-NEXT: movdqa 48(%rcx), %xmm5 +; SSE-NEXT: movdqa 48(%rdx), %xmm10 +; SSE-NEXT: movdqa 48(%rcx), %xmm11 ; SSE-NEXT: movdqa 48(%r8), %xmm9 ; SSE-NEXT: movdqa 48(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -3628,19 +3621,17 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: andps %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa (%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa (%r9), %xmm13 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm12 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3716,9 +3707,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), 
%xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa 16(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] ; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 @@ -3802,9 +3792,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa 32(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 @@ -4568,14 +4557,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpsrld $16, %xmm13, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm1, %xmm4 -; AVX-NEXT: vmovdqa %xmm13, %xmm2 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -4637,10 +4624,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] -; AVX-NEXT: vmovdqa %xmm8, %xmm10 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3 @@ -4698,10 +4684,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpslldq {{.*#+}} xmm10 = 
zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] -; AVX-NEXT: vmovdqa %xmm1, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10 @@ -4794,7 +4779,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm10 ; AVX2-NEXT: vmovdqa (%rdx), %ymm12 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm9 @@ -4807,8 +4792,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,0,4,0,0] -; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm2, %ymm4 ; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] @@ -4841,10 +4825,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-NEXT: vmovdqa 32(%rax), %ymm13 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,3,0,0,0,4] -; AVX2-NEXT: vpermd %ymm6, %ymm4, %ymm5 -; AVX2-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm4, %ymm5 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqa (%rax), %ymm5 @@ -5177,7 +5160,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm12 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm15 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm10 @@ -5189,8 +5172,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,3,0,0,0,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm5 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FP-NEXT: vpermd %ymm11, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 @@ -5220,10 +5202,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX2-FP-NEXT: 
vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm13 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,0,3,0,0,0,4] -; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm5 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm5 @@ -5286,14 +5267,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm3, %xmm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm11 @@ -5533,23 +5512,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $312, %rsp # imm = 0x138 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm8 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 
= ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] @@ -5736,20 +5711,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] @@ -6067,10 +6040,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm14 -; AVX512-NEXT: vmovdqa %xmm1, %xmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm14 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -6234,11 +6206,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-LABEL: store_i16_stride7_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm3 @@ -6246,12 +6217,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm4 @@ -6311,11 +6281,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm21 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] @@ -6695,10 +6664,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm14 -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm14 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -6862,11 +6830,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm3 @@ -6874,12 +6841,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm4 @@ -6939,11 +6905,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] @@ -7721,22 +7686,21 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdx), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm1 +; SSE-NEXT: movdqa 112(%rdx), %xmm10 ; SSE-NEXT: movdqa 96(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rcx), %xmm6 -; SSE-NEXT: movdqa 112(%r8), %xmm4 +; SSE-NEXT: movdqa 112(%rcx), %xmm11 +; SSE-NEXT: movdqa 112(%r8), %xmm9 ; SSE-NEXT: movdqa 112(%r9), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rax), %xmm7 
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -7746,9 +7710,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm8, %xmm0 @@ -7800,7 +7763,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: andps %xmm5, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] @@ -7908,19 +7871,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa 16(%rax), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] ; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 16(%r8), %xmm14 -; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: movdqa 16(%r9), %xmm13 ; SSE-NEXT: movdqa %xmm14, %xmm5 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -7998,12 +7959,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 32(%r8), %xmm14 -; SSE-NEXT: movdqa 32(%r9), %xmm4 +; SSE-NEXT: movdqa 32(%r9), %xmm13 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8082,9 +8042,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa 48(%rax), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -8113,13 +8072,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm2, %xmm0 @@ -8143,7 +8102,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -8172,19 +8131,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa 64(%rax), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 64(%r8), %xmm1 -; SSE-NEXT: movdqa 64(%r9), %xmm2 +; SSE-NEXT: movdqa 64(%r9), %xmm10 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8272,13 +8229,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 80(%r8), %xmm1 -; SSE-NEXT: movdqa 80(%r9), %xmm2 +; SSE-NEXT: movdqa 80(%r9), %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -8607,10 +8563,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm15, %xmm1 @@ -9316,10 +9271,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX-NEXT: vandnps %ymm11, %ymm13, %ymm11 -; AVX-NEXT: vandps %ymm13, %ymm12, %ymm12 -; AVX-NEXT: vmovaps %ymm13, %ymm15 +; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11 +; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12 ; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] @@ -9406,15 +9360,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rdx), %xmm2 -; AVX-NEXT: vmovdqa (%rcx), %xmm3 -; AVX-NEXT: vpsrld $16, %xmm3, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vmovdqa %xmm3, %xmm9 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm2, %xmm10 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rdx), %xmm10 +; AVX-NEXT: vmovdqa (%rcx), %xmm9 +; AVX-NEXT: vpsrld $16, %xmm9, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 ; AVX-NEXT: vmovdqa (%rsi), %xmm1 @@ -9476,38 +9428,33 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX-NEXT: vpsrld $16, %xmm6, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm1, %xmm6 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX-NEXT: vpsrld $16, %xmm3, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX-NEXT: vmovdqa %xmm7, %xmm9 -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm3, %xmm10 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX-NEXT: vpsrld $16, %xmm10, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX-NEXT: vmovdqa 16(%r8), %xmm2 +; AVX-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX-NEXT: vmovdqa 16(%r8), %xmm7 ; AVX-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vmovdqa %xmm2, %xmm7 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm1, %xmm8 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] @@ -9552,15 +9499,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX-NEXT: vpsrld $16, %xmm5, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX-NEXT: vmovdqa %xmm5, %xmm13 -; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm2, %xmm15 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX-NEXT: vpsrld $16, %xmm13, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm2 @@ -9619,24 +9564,21 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX-NEXT: vpsrld $16, %xmm8, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm1, %xmm8 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX-NEXT: vpsrld $16, %xmm7, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX-NEXT: vmovdqa %xmm9, %xmm13 -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm7, %xmm3 -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -9644,14 +9586,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovdqa 48(%r9), %xmm1 -; AVX-NEXT: vmovdqa 48(%r8), %xmm2 +; AVX-NEXT: vmovdqa 48(%r9), %xmm10 +; AVX-NEXT: vmovdqa 48(%r8), %xmm9 ; AVX-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vmovdqa %xmm2, %xmm9 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm1, %xmm10 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm7, %xmm2 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1] @@ -9773,28 +9713,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6] ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX-NEXT: vpsrld $16, %xmm3, %xmm7 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vmovdqa %xmm3, %xmm12 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm2, %xmm8 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX-NEXT: vpsrld $16, %xmm12, %xmm7 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 ; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX-NEXT: vandps %ymm4, %ymm7, %ymm7 ; AVX-NEXT: vorps %ymm0, %ymm7, %ymm7 -; AVX-NEXT: vmovdqa 80(%r9), %xmm0 -; AVX-NEXT: vmovdqa 80(%r8), %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX-NEXT: vmovdqa %xmm2, %xmm3 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm0, %xmm4 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 80(%r9), %xmm4 +; AVX-NEXT: vmovdqa 80(%r8), %xmm3 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 80(%rax), %xmm2 ; 
AVX-NEXT: vmovdqa %xmm6, %xmm0 ; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -9840,21 +9776,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpsrld $16, %xmm13, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpsrld $16, %xmm8, %xmm6 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX-NEXT: vmovdqa %xmm13, %xmm8 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX-NEXT: vmovdqa %xmm0, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 @@ -9908,24 +9842,22 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] ; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] -; AVX-NEXT: vmovdqa %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 ; AVX-NEXT: vandnps %ymm6, %ymm10, %ymm6 ; AVX-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] ; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX-NEXT: vmovdqa %xmm13, %xmm0 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] @@ -9960,9 +9892,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpsrld $16, %xmm0, %xmm6 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX-NEXT: vmovdqa %xmm0, %xmm6 +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX-NEXT: vandps %ymm8, %ymm12, %ymm12 @@ -10053,8 +9984,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpsrld $16, %xmm0, %xmm14 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 ; AVX-NEXT: vandps %ymm8, %ymm12, %ymm12 ; AVX-NEXT: vandnps %ymm5, %ymm8, %ymm5 @@ -10084,8 +10015,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3,4,5,6,7] -; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX-NEXT: vmovdqa %xmm3, %xmm15 +; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm12 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 ; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5 ; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2 @@ -10103,20 +10033,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX-NEXT: vandps %ymm10, %ymm12, %ymm12 ; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX-NEXT: vmovdqa %xmm0, %xmm1 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[2,2,3,3] -; AVX-NEXT: vmovdqa %xmm11, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] @@ -10297,9 +10225,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rax), %ymm6 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,0,4] -; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,3,0,4] +; AVX2-NEXT: vpermd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] @@ -10326,26 +10253,22 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,0,4,0,0,4] -; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,0,0,0,4,0,0,4] +; AVX2-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] -; AVX2-NEXT: vpermd %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,0,4,0,0] +; AVX2-NEXT: vpermd %ymm4, %ymm9, %ymm4 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = 
[0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10387,18 +10310,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-NEXT: vmovdqa 96(%rcx), %ymm4 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqa 96(%rdx), %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] @@ -10407,22 +10327,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] -; AVX2-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa 96(%rax), %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-NEXT: vmovdqa 96(%rax), %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX2-NEXT: vpblendvb 
%ymm2, %ymm0, %ymm1, %ymm0 @@ -10486,9 +10403,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa %xmm1, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] ; AVX2-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10735,8 +10651,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] @@ -10744,15 +10660,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 40(%rax), %ymm7 ; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 72(%rax), %ymm8 ; AVX2-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7 @@ -10892,10 +10807,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm5 +; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm5 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-NEXT: # ymm6 = 
mem[2,2,2,2,6,6,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] @@ -10911,8 +10826,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm6 -; AVX2-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-NEXT: vpshufb %ymm9, %ymm5, %ymm6 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] @@ -11316,9 +11230,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 @@ -11547,8 +11460,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] @@ -11556,15 +11469,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 40(%rax), %ymm7 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 72(%rax), %ymm8 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7 @@ -11598,8 +11510,8 @@ define void @store_i16_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 +; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm5 @@ -11613,9 +11525,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm12, %ymm6 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload @@ -11669,9 +11580,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] @@ -11687,8 +11598,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm8 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] @@ -11913,28 +11823,23 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,0,2,1,4,4,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm12 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] @@ -11958,10 +11863,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 @@ -11973,9 +11877,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = 
[65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12127,10 +12030,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm14 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] @@ -12176,28 +12078,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,0,4,0,0,4] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,0,4,0,0,4] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,0,0,0,4,0,0] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,3,0,0,0,4,0,0] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; 
AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm5 @@ -12207,15 +12107,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,3,0,4] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,3,0,4] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 @@ -12313,22 +12212,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] @@ -12341,12 +12237,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -12550,8 +12445,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm6 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] @@ -12559,15 +12454,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 72(%rax), %ymm9 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 @@ -12816,9 +12710,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm17 ; AVX512-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vmovdqa %xmm7, %xmm15 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = 
[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm15, %xmm6, %xmm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 64(%rcx), %xmm9 @@ -12957,14 +12850,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vprold $16, %ymm5, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13004,9 +12896,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX512-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13026,8 +12917,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0)) ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] @@ -13085,40 +12975,37 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] +; AVX512-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vmovdqa %ymm2, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm0 -; AVX512-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512-NEXT: vprold $16, %ymm7, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -13502,8 +13389,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm27 & (zmm1 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm11 & (zmm1 ^ zmm4)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 @@ -13555,7 +13442,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm27 & (zmm0 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm11 & (zmm0 ^ zmm4)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] @@ -13615,9 +13502,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm14 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -13625,9 +13511,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,1,3,8,8,9,9] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 @@ -13708,12 +13593,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm5 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm11 @@ -14152,9 +14036,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm17 ; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm15 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm9 @@ -14293,14 +14176,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vprold $16, %ymm5, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14340,9 +14222,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512DQ-NEXT: vpbroadcastq 
{{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14362,8 +14243,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0)) ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm9, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] @@ -14421,40 +14301,37 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512DQ-NEXT: vprold $16, %ymm7, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -14838,8 +14715,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm27 & (zmm1 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm11 & (zmm1 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 @@ -14891,7 +14768,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm27 & (zmm0 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm11 & (zmm0 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] @@ -14951,9 +14828,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = 
[6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm14 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -14961,9 +14837,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 @@ -15044,12 +14919,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm11 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 64f5761b31d64..ccd2d58702de0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -1803,7 +1803,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,2,2] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX-NEXT: vmovdqa 16(%r9), %xmm9 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,3,2,3] @@ -1838,8 +1838,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX-NEXT: vmovdqa %xmm7, %xmm9 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -3850,9 +3849,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,1] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,0,1] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm6 @@ -3974,12 +3972,10 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm2, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm11 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7407,10 +7403,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,2,2,0,0,3,3] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm12, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm15, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm3 @@ -7464,10 +7459,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,2,2,2,0,0,3,3] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm14, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm14, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm2 @@ -7519,10 +7513,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,2,2,2,0,0,3,3] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm13, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm13, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm15, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm5 @@ -9313,26 +9306,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 +; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 @@ -9607,26 +9595,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 @@ -9901,26 +9884,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = 
[0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 @@ -10195,26 +10173,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index de2e1df4c5566..7037a2864654f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -1202,14 +1202,13 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rsi), %xmm13 ; SSE-NEXT: movaps 32(%rsi), %xmm12 ; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 16(%rdx), %xmm14 ; SSE-NEXT: movaps 32(%rdx), %xmm7 ; SSE-NEXT: movaps 48(%rdx), %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[0,3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] @@ -1222,7 +1221,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[0,3] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] @@ -1231,8 +1230,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: 
shufps {{.*#+}} xmm13 = xmm13[1,1],xmm14[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -1264,11 +1262,10 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rdx), %xmm2 ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rsi), %xmm12 ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] @@ -2139,14 +2136,13 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rsi), %xmm11 ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm12 ; SSE-NEXT: movaps 16(%rdx), %xmm8 ; SSE-NEXT: movaps 32(%rdx), %xmm9 ; SSE-NEXT: movaps 48(%rdx), %xmm10 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] @@ -2201,11 +2197,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2219,11 +2214,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2237,11 +2231,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps 96(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2255,11 +2248,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2273,11 +2265,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdx), %xmm1 +; SSE-NEXT: movaps 128(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2291,11 +2282,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 144(%rdx), %xmm1 +; SSE-NEXT: movaps 144(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2309,11 +2299,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdx), %xmm1 +; SSE-NEXT: movaps 160(%rdx), %xmm3 ; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2326,11 +2315,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm12 -; SSE-NEXT: movaps 176(%rdx), %xmm1 +; SSE-NEXT: movaps 176(%rdx), %xmm3 ; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2343,11 +2331,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps 192(%rdx), %xmm2 ; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rsi), %xmm11 ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 58991d65cf1ee..b6914ec197300 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -3695,10 +3695,9 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm8 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovaps %ymm8, %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6435,10 +6434,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovaps %ymm13, %ymm2 -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: 
vmovaps (%rcx), %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps (%rdi), %ymm3 @@ -6462,13 +6460,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6481,7 +6478,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] @@ -6491,13 +6488,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX2-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6510,7 +6506,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 80(%r8), 
%ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] @@ -6520,13 +6516,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-NEXT: vmovaps 96(%rcx), %ymm14 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovaps %ymm14, %ymm4 -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6539,7 +6534,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6548,14 +6543,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdx), %ymm13 -; AVX2-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-NEXT: vmovaps %ymm13, %ymm4 -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6577,12 +6570,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX2-NEXT: vmovaps 160(%rdx), %ymm2 ; AVX2-NEXT: vmovaps 160(%rcx), %ymm12 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm9 @@ -7025,10 +7017,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovaps %ymm13, %ymm2 -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 @@ -7052,13 +7043,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7071,7 +7061,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = 
ymm14[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] @@ -7081,13 +7071,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps %ymm13, %ymm3 -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7100,7 +7089,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] @@ -7110,13 +7099,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm14 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovaps %ymm14, %ymm4 -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7129,7 +7117,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -7138,14 +7126,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm13 -; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps %ymm13, %ymm4 -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7167,12 +7153,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm2 ; AVX2-FP-NEXT: vmovaps 160(%rcx), %ymm12 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm9 @@ -7557,12 +7542,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm11 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm15 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7593,14 +7577,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm10 +; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7634,10 +7616,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm5 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7674,14 +7655,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm14, %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: 
vmovaps 96(%rdx), %ymm5 +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7715,14 +7694,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm13 -; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm13, %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm5 +; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7757,10 +7734,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index e4f616ed730eb..c511c131193f7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -3644,11 +3644,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero -; AVX2-NEXT: vmovdqa %xmm10, 
%xmm11 -; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero +; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -3661,9 +3660,8 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vmovdqa %xmm6, %xmm12 -; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero +; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 36(%r9), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -3680,10 +3678,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero -; AVX2-NEXT: vmovdqa %xmm8, %xmm15 -; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 64(%r8), %xmm15 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm15[0],zero,xmm15[1],zero +; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -4031,11 +4028,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero -; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm11 -; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero +; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -4048,9 +4044,8 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero -; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm12 -; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero +; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 36(%r9), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -4067,10 +4062,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%r8), %xmm15 +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm15[0],zero,xmm15[1],zero +; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -6588,7 +6582,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdx), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rcx), %ymm5 +; AVX-NEXT: vmovaps (%rcx), %ymm15 ; AVX-NEXT: vmovaps (%r8), %ymm6 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rcx), %xmm1 @@ -6617,9 +6611,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX-NEXT: vmovaps %ymm5, %ymm15 -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -9847,10 +9840,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 @@ -9858,7 +9851,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9867,8 +9860,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10254,9 +10246,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 @@ -10416,10 +10407,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 @@ -10427,7 +10418,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10436,8 +10427,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10823,9 +10813,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 @@ -10985,10 +10974,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 @@ -10996,7 +10985,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11005,8 +10994,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11392,9 +11380,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 @@ -11554,10 +11541,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 @@ -11565,7 +11552,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11574,8 +11561,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11961,9 +11947,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 13930bc2c6740..6476c3139daa7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -866,17 +866,16 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: 
movdqa (%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm14 ; SSE-NEXT: movdqa 16(%rdx), %xmm6 ; SSE-NEXT: movdqa 16(%rcx), %xmm12 ; SSE-NEXT: movdqa 16(%r8), %xmm11 ; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movaps 16(%r9), %xmm0 +; SSE-NEXT: movaps 16(%r9), %xmm15 ; SSE-NEXT: movdqa (%rax), %xmm10 ; SSE-NEXT: movaps 16(%rax), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm15[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] @@ -885,17 +884,15 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm12[1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1781,7 +1778,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa (%rsi), %xmm12 ; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movaps (%rdx), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1799,14 +1796,13 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] @@ -1832,12 +1828,10 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 32(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2071,20 +2065,17 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX: # %bb.0: ; AVX-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX-NEXT: vmovaps 32(%rcx), %ymm8 ; AVX-NEXT: vmovaps 32(%r8), %ymm0 ; AVX-NEXT: vmovaps 32(%r9), %ymm1 ; AVX-NEXT: vmovaps 32(%rax), %ymm2 -; AVX-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX-NEXT: vmovaps %ymm7, %ymm8 -; AVX-NEXT: vmovaps %ymm4, %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] -; AVX-NEXT: vmovaps %ymm5, %ymm9 -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,0],ymm9[4,5],ymm4[6,4] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] @@ -2097,18 +2088,16 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%r8), %xmm11 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm4[0] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] -; AVX-NEXT: vmovaps %xmm4, %xmm13 -; AVX-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm13[0] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm13[2,1] +; AVX-NEXT: vmovaps %xmm13, (%rsp) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX-NEXT: vmovaps %xmm14, %xmm15 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm5, %xmm14 ; AVX-NEXT: vmovaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] @@ -2152,11 +2141,10 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm14[1] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1],xmm0[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps (%rcx), %xmm2 +; AVX-NEXT: vmovaps (%rcx), %xmm7 ; AVX-NEXT: vmovaps (%rdx), %xmm12 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm2[1],zero -; AVX-NEXT: vmovaps %xmm2, %xmm7 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm7[1],zero +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps (%r9), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2485,8 +2473,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -2504,9 +2492,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] ; AVX2-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-NEXT: vmovaps %ymm4, %ymm13 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] @@ -2737,8 +2724,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -2756,9 +2743,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] ; AVX2-FP-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-FP-NEXT: vmovaps %ymm4, %ymm13 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] @@ -2829,9 +2815,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps (%r9), %xmm7 ; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FCP-NEXT: vmovaps %xmm4, %xmm14 +; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm14 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -4064,13 +4049,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm2 +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 32(%rcx), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4118,13 +4101,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movdqa 64(%rsi), %xmm12 +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4145,12 +4126,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm3 +; SSE-NEXT: movdqa 80(%rsi), %xmm12 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4168,15 +4149,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm6 -; SSE-NEXT: movaps 96(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm3 ; SSE-NEXT: movaps 96(%r8), %xmm4 @@ -4187,10 +4167,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm2 -; SSE-NEXT: movdqa 96(%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rax), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -4657,12 +4636,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rdi), %xmm2 +; AVX-NEXT: vmovaps (%rdi), %xmm6 ; AVX-NEXT: vmovaps (%rsi), %xmm5 -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm2[0] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] -; AVX-NEXT: vmovaps %xmm2, %xmm6 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmovaps (%rcx), %xmm8 ; AVX-NEXT: vmovaps (%rdx), %xmm9 @@ -4711,20 +4689,17 @@ 
define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm5 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm3, %xmm6 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm2, %xmm9 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%r9), %xmm3 @@ -4775,20 +4750,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX-NEXT: vmovaps 64(%rsi), %xmm7 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm6 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 64(%rcx), %xmm4 -; AVX-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX-NEXT: vmovaps %xmm4, %xmm9 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm2, %xmm10 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%r9), %xmm4 @@ -4820,11 +4792,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] ; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: 
vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm9[1,1],ymm2[5,5],ymm9[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm9[1,1],ymm13[5,5],ymm9[5,5] ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps %ymm2, %ymm13 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps 64(%r8), %ymm7 ; AVX-NEXT: vmovaps 64(%r9), %ymm10 @@ -4889,10 +4860,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX-NEXT: vmovaps %ymm1, %ymm6 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -6288,10 +6258,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm7 -; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm7 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-FCP-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1 @@ -6325,14 +6294,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm14 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm8 +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm14 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: 
vmovaps 16(%rax), %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] @@ -8433,13 +8400,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm2 +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 32(%rcx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8486,13 +8451,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movdqa 64(%rsi), %xmm2 +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8539,13 +8502,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm1 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movdqa 96(%rsi), %xmm2 +; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm10 ; SSE-NEXT: movaps 96(%r8), %xmm0 @@ -8591,13 +8552,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rsi), %xmm1 -; SSE-NEXT: movaps 128(%rdx), %xmm4 -; 
SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movdqa 128(%rsi), %xmm2 +; SSE-NEXT: movaps 128(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 128(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8644,13 +8603,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rsi), %xmm1 -; SSE-NEXT: movaps 160(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movdqa 160(%rsi), %xmm2 +; SSE-NEXT: movaps 160(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 160(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8697,13 +8654,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rsi), %xmm1 -; SSE-NEXT: movaps 192(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movdqa 192(%rsi), %xmm2 +; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 192(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8731,20 +8686,18 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm6 +; SSE-NEXT: movaps 208(%rcx), %xmm2 ; SSE-NEXT: movaps 208(%r8), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} 
xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%r9), %xmm6 -; SSE-NEXT: movdqa 208(%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -9676,12 +9629,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovaps (%rdi), %xmm6 ; AVX-NEXT: vmovaps (%rsi), %xmm5 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm6 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps (%rcx), %xmm7 ; AVX-NEXT: vmovaps (%rdx), %xmm8 @@ -9731,20 +9683,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX-NEXT: vmovaps 32(%rdi), %xmm9 ; AVX-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm9 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm9[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,1] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm3, %xmm10 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm2, %xmm12 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%r9), %xmm3 @@ -9795,18 +9744,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), 
%xmm1 +; AVX-NEXT: vmovaps 64(%rdi), %xmm7 ; AVX-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm7 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm7[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX-NEXT: vmovaps 64(%rcx), %xmm9 ; AVX-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm3, %xmm9 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -9858,20 +9805,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm5 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm5[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX-NEXT: vmovaps %xmm4, %xmm7 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm2, %xmm9 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX-NEXT: vmovaps 96(%rdx), %xmm9 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 96(%r9), %xmm4 @@ -9921,20 +9865,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX-NEXT: vmovaps 128(%rdi), %xmm5 ; AVX-NEXT: vmovaps 128(%rsi), %xmm4 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm5 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] +; AVX-NEXT: 
vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 128(%rcx), %xmm3 -; AVX-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm3, %xmm7 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm2, %xmm9 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 128(%rcx), %xmm7 +; AVX-NEXT: vmovaps 128(%rdx), %xmm9 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 128(%r9), %xmm3 @@ -9985,12 +9926,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX-NEXT: vmovaps 160(%rdi), %xmm11 ; AVX-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm11 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm11[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 160(%rcx), %xmm7 ; AVX-NEXT: vmovaps 160(%rdx), %xmm3 @@ -10046,12 +9986,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX-NEXT: vmovaps %xmm1, %xmm12 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm12[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,1] +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 192(%rcx), %xmm7 ; AVX-NEXT: vmovaps 192(%rdx), %xmm6 @@ -13191,12 +13130,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm12 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm10 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm8 @@ -13946,26 +13884,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -14422,26 +14356,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = 
[2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -14898,26 +14828,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -15374,26 +15300,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -15850,26 +15772,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -16326,26 +16244,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -16802,26 +16716,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 @@ -17278,26 +17188,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 265f6daeb2003..bd6c527e06b14 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -2379,18 +2379,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-NEXT: vmovaps (%rcx), %xmm3 ; AVX2-NEXT: vmovaps (%rdx), %xmm14 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-NEXT: vmovaps %xmm1, %xmm3 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-NEXT: vmovaps %xmm2, %xmm5 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: vmovaps (%rax), %xmm11 ; AVX2-NEXT: vmovaps (%r10), %xmm12 @@ -2629,18 +2627,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm3 ; AVX2-FP-NEXT: vmovaps (%rdx), %xmm14 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-FP-NEXT: vmovaps %xmm1, %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FP-NEXT: vmovaps %xmm2, %xmm5 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FP-NEXT: vmovaps (%rax), %xmm11 ; AVX2-FP-NEXT: vmovaps (%r10), %xmm12 @@ -2879,18 +2875,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm3 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm14 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-FCP-NEXT: vmovaps %xmm1, %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FCP-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm5 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm11 ; AVX2-FCP-NEXT: vmovaps (%r10), %xmm12 @@ -6480,26 +6474,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -6774,26 +6763,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; 
AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -7068,26 +7052,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -7362,26 +7341,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -7656,26 +7630,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -7950,26 +7919,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -8244,26 +8208,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -8538,26 +8497,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 @@ -13784,21 +13738,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -14438,21 +14389,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -15092,21 +15040,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -15746,21 +15691,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -16400,21 +16342,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -17054,21 +16993,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; 
AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -17708,21 +17644,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm6 @@ -18362,21 +18295,18 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index ffdbdea024ea0..8dded96acfc12 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -7461,10 +7461,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX-NEXT: vmovaps %ymm1, %ymm10 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 320(%rcx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -7477,10 +7476,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX-NEXT: vmovaps %ymm1, %ymm9 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 384(%rdi), %ymm9 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%rcx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -7500,9 +7498,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 480(%rdi), %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX-NEXT: vmovapd %ymm1, %ymm13 +; AVX-NEXT: vmovapd 480(%rdi), %ymm13 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2] ; AVX-NEXT: vmovapd 480(%rcx), %xmm1 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index 99bcebd28f120..e70975addc676 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -13430,21 +13430,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; 
AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -13457,7 +13456,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13468,7 +13467,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13479,7 +13478,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13490,7 +13489,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13501,7 +13500,7 @@ define void @store_i64_stride6_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13513,35 +13512,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -14125,21 +14123,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm11, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -14152,7 +14149,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14163,7 +14160,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14174,7 +14171,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14185,7 +14182,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14196,7 +14193,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q 
%zmm13, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14208,35 +14205,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -14820,21 +14816,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm15 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -14847,7 +14842,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14858,7 +14853,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14869,7 +14864,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14880,7 +14875,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14891,7 +14886,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14903,35 +14898,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 
%zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -15515,21 +15509,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -15542,7 +15535,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15553,7 +15546,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15564,7 +15557,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15575,7 +15568,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15586,7 +15579,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15598,35 +15591,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -16210,21 +16202,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; 
AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -16237,7 +16228,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16248,7 +16239,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16259,7 +16250,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16270,7 +16261,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16281,7 +16272,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16293,35 +16284,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 -; 
AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -16905,21 +16895,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -16932,7 +16921,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 
%zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16943,7 +16932,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16954,7 +16943,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16965,7 +16954,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16976,7 +16965,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16988,35 +16977,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -17600,21 +17588,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -17627,7 +17614,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17638,7 +17625,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17649,7 +17636,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17660,7 +17647,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17671,7 +17658,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17683,35 +17670,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm1 ; 
AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm10 @@ -18295,21 +18281,20 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 @@ -18322,7 +18307,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18333,7 +18318,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm15, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18344,7 +18329,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18355,7 +18340,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18366,7 +18351,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18378,35 +18363,34 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 89642492f83a8..48e1d0a7fb930 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -8013,26 +8013,23 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; 
AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa (%r9), %ymm10 ; AVX512-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill @@ -8456,25 +8453,22 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] +; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] +; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 @@ -8491,11 +8485,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] ; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 ; AVX512-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 @@ -8898,25 +8891,22 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm17 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm5 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] +; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm8 ; AVX512DQ-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill @@ -9337,35 +9327,31 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FCP-NEXT: # zmm18 = 
mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm16 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm1[0],ymm17[2],ymm1[2] ; AVX512DQ-FCP-NEXT: movb $28, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 @@ -9771,26 +9757,23 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa (%r9), %ymm10 ; AVX512BW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill @@ -10214,25 +10197,22 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; 
AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] +; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 @@ -10249,11 +10229,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] ; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 ; AVX512BW-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 @@ -10656,25 +10635,22 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] +; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm8 ; AVX512DQ-BW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill @@ -11095,35 +11071,31 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r8), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = 
ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm1[0],ymm17[2],ymm1[2] ; AVX512DQ-BW-FCP-NEXT: movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 @@ -16930,8 +16902,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: # %bb.0: ; AVX512-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm9 @@ -16959,16 +16931,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 @@ -16986,15 +16956,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %r10d, %k2 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] +; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] +; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] @@ -17002,10 +16970,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] @@ -17027,8 +16994,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm16, %zmm7, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 +; AVX512-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 @@ -17066,8 +17033,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 @@ -17088,31 +17055,28 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512-NEXT: vmovdqa 192(%r9), %ymm5 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 ; AVX512-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 @@ -17892,8 +17856,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 @@ -17908,10 +17872,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] +; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: movb $96, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 @@ -17921,20 +17885,17 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -17962,9 +17923,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 
%zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] @@ -17972,9 +17933,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] @@ -17993,7 +17954,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 @@ -18002,14 +17963,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 ; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm1 @@ -18033,8 +17992,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 @@ -18053,9 +18012,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm14 ; AVX512-FCP-NEXT: vmovdqa 192(%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 192(%r8), %ymm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %ymm24 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 @@ -18072,7 +18030,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 @@ -18090,19 +18048,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm0 ; AVX512-FCP-NEXT: vmovdqa 256(%r9), %ymm10 ; AVX512-FCP-NEXT: vmovdqa 256(%r8), %ymm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm16[2,3,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 @@ -18132,9 +18089,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 320(%rax), %zmm1 ; AVX512-FCP-NEXT: vmovdqa 320(%r9), %ymm6 -; AVX512-FCP-NEXT: vmovdqa 320(%r8), %ymm14 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %ymm31 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm31[0],ymm6[0],ymm31[2],ymm6[2] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm8 @@ -18852,8 +18808,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $6280, 
%rsp # imm = 0x1888 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm9 @@ -18881,17 +18837,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm8, %zmm2 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 @@ -18913,10 +18866,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] @@ -18970,25 +18922,24 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm8 ; AVX512DQ-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 
128(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm8, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 @@ -19010,28 +18961,27 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQ-NEXT: vmovdqa 192(%r9), %ymm8 ; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 @@ -19806,8 +19756,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 @@ -19835,17 +19785,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm23 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 @@ -19863,11 +19810,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 @@ -19902,7 +19848,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 @@ -19926,9 +19872,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 128(%r8), %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %ymm30 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 @@ -19962,9 +19907,8 
@@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm24[0],ymm9[2],ymm24[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 @@ -20759,8 +20703,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 @@ -20788,16 +20732,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 @@ -20815,15 +20757,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] +; AVX512BW-NEXT: # zmm30 
= mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] @@ -20831,10 +20771,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] @@ -20856,8 +20795,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 @@ -20895,8 +20834,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 @@ -20917,31 +20856,28 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512BW-NEXT: vmovdqa 192(%r9), %ymm5 ; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 @@ -21721,8 +21657,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 @@ -21737,10 +21673,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] +; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: movb $96, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 @@ -21750,20 +21686,17 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] 
+; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -21791,9 +21724,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] @@ -21801,9 +21734,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] @@ -21822,7 +21755,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 @@ -21831,14 +21764,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm1 @@ -21862,8 +21793,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 @@ -21882,9 +21813,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa 192(%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 192(%r8), %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 @@ -21901,7 +21831,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 @@ -21919,19 +21849,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa 256(%r9), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqa 256(%r8), %ymm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm16[2,3,2,3] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 @@ -21961,9 +21890,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 320(%r9), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 320(%r8), %ymm14 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %ymm31 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm31[0],ymm6[0],ymm31[2],ymm6[2] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm8 @@ -22681,8 +22609,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $6280, %rsp # imm = 0x1888 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm9 @@ -22710,17 +22638,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 @@ -22742,10 +22667,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] @@ -22799,25 +22723,24 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 @@ -22839,28 +22762,27 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa 192(%r9), %ymm8 ; AVX512DQ-BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 @@ -23635,8 +23557,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 @@ -23664,17 +23586,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 @@ -23692,11 +23611,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] ; 
AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 @@ -23731,7 +23649,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 @@ -23755,9 +23673,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r8), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 @@ -23791,9 +23708,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm24[0],ymm9[2],ymm24[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 2936b55ef6ed4..b87abded10819 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -977,13 +977,12 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm12 ; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rdx), %xmm9 ; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm6, %xmm1 @@ -996,9 +995,8 @@ define void 
@store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1] @@ -1653,7 +1651,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rsi), %xmm7 @@ -1677,8 +1675,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] @@ -2004,9 +2001,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm2 -; AVX-NEXT: vmovaps %ymm1, %ymm7 +; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm2 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -3783,29 +3779,25 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX: # %bb.0: ; AVX-NEXT: subq $104, %rsp ; AVX-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, %xmm14 +; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX-NEXT: # xmm14 = mem[0,0] +; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; 
AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm4 -; AVX-NEXT: vmovaps %ymm5, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX-NEXT: # xmm5 = mem[0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX-NEXT: vmovdqa %xmm5, %xmm10 +; AVX-NEXT: vmovddup {{.*#+}} xmm10 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX-NEXT: # xmm10 = mem[0,0] +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm6 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX-NEXT: # xmm8 = mem[0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm7 -; AVX-NEXT: vmovdqa %xmm8, %xmm12 +; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX-NEXT: # xmm12 = mem[0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm7 ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] @@ -3831,10 +3823,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] -; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] -; AVX-NEXT: # xmm6 = mem[0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm6, %xmm15 +; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] +; AVX-NEXT: # xmm15 = mem[0,0] +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX-NEXT: vandnps %ymm4, %ymm11, %ymm1 @@ -3891,17 +3882,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] ; AVX-NEXT: # xmm13 = mem[0,0] ; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX-NEXT: # xmm9 = mem[0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm5 +; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX-NEXT: # xmm15 = mem[0,0] +; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa %xmm5, %xmm12 +; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = 
[0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] +; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] ; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm5 ; AVX-NEXT: vmovdqa %xmm10, %xmm7 @@ -3937,8 +3927,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa 16(%rsi), %xmm8 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm9, %xmm15 +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm1 @@ -4107,7 +4096,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-LABEL: store_i8_stride5_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $248, %rsp -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7 @@ -4161,9 +4150,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-NEXT: vpshufb %ymm15, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm1 +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm3 @@ -4369,7 +4357,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 @@ -4383,9 +4371,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm14, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm11 @@ -4538,7 +4525,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $168, %rsp 
; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4592,9 +4579,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 7beba6e30cf4c..4d9cc3c8a7dcb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -605,11 +605,10 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm7 = mem[0],zero ; SSE-NEXT: pxor %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE-NEXT: movdqa %xmm3, %xmm4 @@ -2380,10 +2379,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm8 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm10 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] @@ -2516,10 +2514,9 @@ define void 
@store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm8 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm10 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] @@ -4047,9 +4044,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX-NEXT: vmovdqa %xmm7, %xmm8 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] ; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4065,9 +4061,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vandnps %ymm4, %ymm10, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u] -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm6 -; AVX-NEXT: vmovdqa %xmm7, %xmm15 +; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u] +; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm6 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] ; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] @@ -4138,9 +4133,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] ; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm15 +; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] +; AVX-NEXT: vpshufb %xmm15, %xmm5, %xmm2 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] @@ -4178,10 +4172,9 @@ define void 
@store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX-NEXT: vmovaps %ymm13, %ymm15 +; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128] @@ -4432,26 +4425,23 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vmovdqa %xmm2, %xmm8 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm2 -; AVX2-NEXT: vmovdqa %xmm3, %xmm11 -; AVX2-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 @@ -4464,9 +4454,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-NEXT: vmovdqa %xmm12, %xmm14 -; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] @@ -4495,18 +4484,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] ; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rsi), %ymm9 -; AVX2-NEXT: vpshufb %ymm5, %ymm9, %ymm6 -; AVX2-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm6 ; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqa (%r8), %xmm6 +; AVX2-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] -; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX2-NEXT: vmovdqa %xmm6, %xmm7 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 @@ -4535,10 +4522,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX2-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4763,7 +4749,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4771,12 +4757,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm12 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm4 @@ -4785,9 +4770,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] @@ -4813,20 +4797,18 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] @@ -4838,9 +4820,8 @@ 
define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm5 @@ -5088,7 +5069,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5096,12 +5077,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm12 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm4 @@ -5110,9 +5090,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm14 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] @@ -5138,20 +5117,18 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] @@ -5163,9 +5140,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm7 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 324d1ceef1012..98a64ee987f7b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1374,7 +1374,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm12 ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1399,8 +1399,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm3 @@ -1432,10 +1431,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm4, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa (%rax), %xmm7 +; SSE-NEXT: movdqa (%rax), %xmm15 ; SSE-NEXT: por %xmm10, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm11, %xmm7 @@ -2680,7 +2678,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rdx), %xmm3 ; SSE-NEXT: movdqa 16(%rcx), %xmm7 @@ -2688,8 +2686,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 16(%r8), %xmm6 ; SSE-NEXT: movdqa 16(%r9), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm0 @@ -2846,10 +2843,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3] ; SSE-NEXT: movdqa %xmm12, %xmm3 @@ -2860,10 +2856,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] @@ -3443,10 +3438,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa (%r9), %xmm2 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm0 -; AVX-NEXT: vmovdqa %xmm2, %xmm7 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%r9), %xmm7 +; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX-NEXT: vmovdqa (%r8), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm2 @@ -3771,12 +3765,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] @@ -3794,12 +3788,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero -; AVX2-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 @@ -5320,22 +5312,20 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $648, %rsp # imm = 0x288 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa 48(%rsi), %xmm11 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 ; SSE-NEXT: movdqa 48(%rcx), %xmm10 -; SSE-NEXT: 
movdqa 48(%r8), %xmm5 +; SSE-NEXT: movdqa 48(%r8), %xmm9 ; SSE-NEXT: movdqa 48(%r9), %xmm8 ; SSE-NEXT: movdqa 48(%rax), %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] @@ -5343,15 +5333,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -5359,8 +5348,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm5 @@ -5567,10 +5555,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] ; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa (%r8), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 @@ -6731,17 +6718,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa 16(%rax), %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX-NEXT: vmovdqa 16(%r9), %xmm4 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX-NEXT: vmovdqa %xmm4, %xmm8 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm3, %xmm10 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -6784,7 +6769,6 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u] ; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX-NEXT: vmovdqa %xmm4, %xmm6 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] @@ -6798,10 +6782,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128] ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9] +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9] ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm3 -; AVX-NEXT: vmovdqa %xmm4, %xmm13 +; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] @@ -6817,19 +6800,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm0 ; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm1 ; AVX-NEXT: vpor 
%xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa 32(%rax), %xmm8 +; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm1 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero -; AVX-NEXT: vmovdqa %xmm2, %xmm8 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX-NEXT: vmovdqa %xmm4, %xmm11 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm3, %xmm10 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6852,9 +6833,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX-NEXT: vmovdqa %xmm12, %xmm9 +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] ; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 @@ -7057,12 +7037,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u,u,u,u],zero +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] ; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX-NEXT: vmovdqa %xmm1, %xmm8 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] ; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm11 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] @@ 
-7099,13 +7078,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] ; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovaps %ymm1, %ymm8 +; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0 ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] @@ -7130,8 +7108,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm3 -; AVX-NEXT: vmovdqa %xmm7, %xmm9 +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7359,10 +7336,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-NEXT: vmovdqa %xmm2, %xmm14 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] @@ -7391,9 +7367,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-NEXT: vmovdqa 
32(%r8), %xmm8 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] @@ -7716,19 +7691,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero @@ -7779,20 +7752,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm7 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 
%xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm11 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] @@ -7807,10 +7777,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] @@ -7940,15 +7909,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm4, %ymm6, %ymm4 @@ -7962,10 +7929,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm2, %ymm7, %ymm7 -; AVX2-FP-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] @@ -8133,19 +8099,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero @@ -8196,12 +8160,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX2-FCP-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] @@ -8355,15 +8318,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 @@ -8377,10 +8338,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %ymm2, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; 
AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] @@ -8960,9 +8920,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] @@ -9748,9 +9707,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 20c245aa08663..3a70df7617f18 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -3240,19 +3240,16 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FP-NEXT: vpshufb 
%ymm4, %ymm8, %ymm8 -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FP-NEXT: vpshufb %ymm13, %ymm9, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill @@ -3393,19 +3390,16 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill @@ -3774,9 +3768,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm3 = [2312,2826,3340,3854] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 @@ -4134,16 +4127,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm4 = [1284,1798] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] @@ -5657,11 +5648,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX-NEXT: vandps %ymm11, %ymm8, %ymm8 -; AVX-NEXT: vmovaps %ymm11, %ymm12 +; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8 ; AVX-NEXT: vorps %ymm9, %ymm8, %ymm8 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,4,6,5] ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,6,6,7] @@ -6403,16 +6393,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm3 = [1284,1798] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] @@ -6727,16 +6714,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm3 = [1284,1798] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] @@ -7530,12 +7514,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 @@ -8306,12 +8288,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 834ea7385d312..85e782e908349 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -6443,10 +6443,9 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $2, %k5, %k4 +; AVX512BW-NEXT: 
kshiftlw $15, %k4, %k3 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF @@ -6665,8 +6664,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k4, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $21, %k1, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $21, %k7, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k2, %k3 @@ -6683,8 +6682,7 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k1, %k4 -; AVX512BW-NEXT: kmovq %k1, %k7 +; AVX512BW-NEXT: kshiftrd $22, %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 @@ -7053,12 +7051,11 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k5, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k4 -; AVX512BW-NEXT: kmovq %k2, %k5 +; AVX512BW-NEXT: kshiftrw $2, %k5, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload @@ -7879,8 +7876,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $16, %k7, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -7904,8 +7901,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $17, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8265,8 +8261,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $32, %k2, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), 
%k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $32, %k7, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8290,8 +8286,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k2, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kshiftrq $33, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8651,8 +8646,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $48, %k7, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8675,8 +8670,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $49, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -9905,17 +9899,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k6, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 -; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -9990,12 +9983,11 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k6, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $23, %k2, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $23, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $22, %k6, %k5 ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k2, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, 
%k5, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 @@ -10173,13 +10165,12 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 277525796824b..3c98eba69ae5b 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -4096,10 +4096,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %cl -; FALLBACK20-NEXT: movzbl %cl, %ecx -; FALLBACK20-NEXT: movl 32(%esp,%ecx), %esi -; FALLBACK20-NEXT: movl 36(%esp,%ecx), %ebx -; FALLBACK20-NEXT: movl %ecx, %edi +; FALLBACK20-NEXT: movzbl %cl, %edi +; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi +; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi @@ -4423,10 +4422,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %cl -; FALLBACK24-NEXT: movzbl %cl, %ecx -; FALLBACK24-NEXT: movl 32(%esp,%ecx), %esi -; FALLBACK24-NEXT: movl 36(%esp,%ecx), %ebx -; FALLBACK24-NEXT: movl %ecx, %edi +; FALLBACK24-NEXT: movzbl %cl, %edi +; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi +; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi @@ -4745,10 +4743,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %cl -; FALLBACK28-NEXT: movzbl %cl, %ecx -; FALLBACK28-NEXT: movl 32(%esp,%ecx), %esi -; FALLBACK28-NEXT: movl 36(%esp,%ecx), %ebx -; FALLBACK28-NEXT: movl %ecx, %edi +; FALLBACK28-NEXT: movzbl %cl, %edi +; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi +; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi @@ -6922,15 +6919,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %cl ; FALLBACK20-NEXT: negb %cl -; FALLBACK20-NEXT: movsbl %cl, %eax -; FALLBACK20-NEXT: movl 84(%esp,%eax), %edi +; 
FALLBACK20-NEXT: movsbl %cl, %ebx +; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: movb %dh, %dl ; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: movl 80(%esp,%eax), %esi -; FALLBACK20-NEXT: movl %eax, %ebx +; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK20-NEXT: movl %esi, %eax ; FALLBACK20-NEXT: shrl %eax ; FALLBACK20-NEXT: movl %edx, %ecx @@ -7250,15 +7246,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %cl ; FALLBACK24-NEXT: negb %cl -; FALLBACK24-NEXT: movsbl %cl, %eax -; FALLBACK24-NEXT: movl 84(%esp,%eax), %edi +; FALLBACK24-NEXT: movsbl %cl, %ebx +; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: movb %dh, %dl ; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: movl 80(%esp,%eax), %esi -; FALLBACK24-NEXT: movl %eax, %ebx +; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK24-NEXT: movl %esi, %eax ; FALLBACK24-NEXT: shrl %eax ; FALLBACK24-NEXT: movl %edx, %ecx @@ -7573,15 +7568,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %cl ; FALLBACK28-NEXT: negb %cl -; FALLBACK28-NEXT: movsbl %cl, %eax -; FALLBACK28-NEXT: movl 84(%esp,%eax), %edi +; FALLBACK28-NEXT: movsbl %cl, %ebx +; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: movb %dh, %dl ; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: movl 80(%esp,%eax), %esi -; FALLBACK28-NEXT: movl %eax, %ebx +; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK28-NEXT: movl %esi, %eax ; FALLBACK28-NEXT: shrl %eax ; FALLBACK28-NEXT: movl %edx, %ecx diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 8c0873492ce40..43d2a997c81d2 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -1581,11 +1581,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax @@ -2141,8 +2140,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al ; X86-NO-BMI2-NO-SHLD-NEXT: negb %al ; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
64(%esp,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl @@ -2150,7 +2149,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp @@ -2346,12 +2344,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp @@ -2716,10 +2713,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch @@ -4636,13 +4632,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll 
b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 044be12a39543..fbbf2a6c127a5 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -3270,11 +3270,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: movl %ecx, %edi ; X86-SHLD-NEXT: andl $60, %edi ; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx -; X86-SHLD-NEXT: movl 20(%esp,%edi), %eax -; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: movl 20(%esp,%edi), %esi +; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: andl $24, %ecx -; X86-SHLD-NEXT: movl %eax, %esi ; X86-SHLD-NEXT: movl %edx, %eax ; X86-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill